/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2017 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"

/* This file should be included last.  */
#include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type
{
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};

struct aarch64_address_info
{
  enum aarch64_address_type type;
  rtx base;
  rtx offset;
  int shift;
  enum aarch64_symbol_type symbol_type;
};

struct simd_immediate_info
{
  rtx value;
  int shift;
  int element_width;
  bool mvn;
  bool msl;
};

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
						 const unsigned char *sel);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
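
/* A usage sketch (not exhaustive): these tables back the parsing of the
   -moverride string, e.g. something like
       -moverride=fuse=adrp+add  or  -moverride=tune=rename_fma_regs
   where the accepted names come from aarch64-fusion-pairs.def and
   aarch64-tuning-flags.def above.  */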
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table cortexa57_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  10, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3  /* cond_not_taken_branch_cost  */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan (ThunderX2 T99).  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  6, /* scalar_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  6, /* vec_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1  /* cond_not_taken_branch_cost  */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  2,  /* Predictable.  */
  2   /* Unpredictable.  */
};

/* Branch costs for Cortex-A57.  */
static const struct cpu_branch_cost cortexa57_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Branch costs for Vulcan.  */
static const struct cpu_branch_cost thunderx2t99_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};
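
/* An illustration (not a definitive statement of the expansion): a core
   whose approx_modes select AARCH64_APPROX_ALL for sqrt permits, under the
   appropriate unsafe-math / -mlow-precision options, sqrt to be expanded
   with the FRSQRTE estimate plus FRSQRTS Newton-Raphson steps instead of a
   full-precision FSQRT.  */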
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  8,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &cortexa57_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &cortexa57_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  4,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  64,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)	/* tune_flags.  */
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  0,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  64,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &thunderx2t99_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  64,	/* cache_line_size.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};

/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
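
/* A minimal sketch of the output, assuming the caller passes the inverted
   condition in BRANCH_FORMAT (e.g. "b.ne\t" when the original branch was a
   b.eq to a label more than 1 MiB away):

	b.ne	.Lbcond<N>		// short conditional hop
	b	<original target>	// unconditional, +/- 128 MiB range
   .Lbcond<N>:  */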
void
aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
{
  const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
  if (TARGET_GENERAL_REGS_ONLY)
    error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
  else
    error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
   the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
   used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
   cost (in this case the best class is the lowest cost one).  Using ALL_REGS
   irrespective of its cost results in bad allocations with many redundant
   int<->FP moves which are expensive on various cores.
   To avoid this we don't allow ALL_REGS as the allocno class, but force a
   decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
   isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
   Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  enum machine_mode mode;

  if (allocno_class != ALL_REGS)
    return allocno_class;

  if (best_class != ALL_REGS)
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
/* Implement TARGET_MIN_DIVISIONS_FOR_RECIP_MUL.  */
static unsigned int
aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
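
/* For example, with min_div_recip_mul_sf == 2 the middle end only rewrites
   x/y and z/y as x*(1/y) and z*(1/y) (under -funsafe-math-optimizations)
   when at least two divisions share the divisor y; a single division is
   left alone.  */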
/* Implement TARGET_SCHED_REASSOCIATION_WIDTH.  */
static int
aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
			     enum machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  if (FLOAT_MODE_P (mode))
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
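
/* For example, under this numbering x0 maps to DWARF register 0, x29 to 29,
   sp to 31 and v0 to 64.  */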
988 /* Return TRUE if MODE is any of the large INT modes. */
990 aarch64_vect_struct_mode_p (machine_mode mode
)
992 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
995 /* Return TRUE if MODE is any of the vector modes. */
997 aarch64_vector_mode_p (machine_mode mode
)
999 return aarch64_vector_mode_supported_p (mode
)
1000 || aarch64_vect_struct_mode_p (mode
);
1003 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1005 aarch64_array_mode_supported_p (machine_mode mode
,
1006 unsigned HOST_WIDE_INT nelems
)
1009 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1010 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1011 && (nelems
>= 2 && nelems
<= 4))
/* Implement HARD_REGNO_NREGS.  */
int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
    default:
      return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
    }
  gcc_unreachable ();
}
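
/* For example, a 16-byte V2DImode value needs a single FP/SIMD register
   (UNITS_PER_VREG == 16) but two X registers (UNITS_PER_WORD == 8).  */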
1033 /* Implement HARD_REGNO_MODE_OK. */
1036 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1038 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1039 return regno
== CC_REGNUM
;
1041 if (regno
== SP_REGNUM
)
1042 /* The purpose of comparing with ptr_mode is to support the
1043 global register variable associated with the stack pointer
1044 register via the syntax of asm ("wsp") in ILP32. */
1045 return mode
== Pmode
|| mode
== ptr_mode
;
1047 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1048 return mode
== Pmode
;
1050 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
1053 if (FP_REGNUM_P (regno
))
1055 if (aarch64_vect_struct_mode_p (mode
))
1057 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
1065 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1067 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
1070 /* Handle modes that fit within single registers. */
1071 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
1073 if (GET_MODE_SIZE (mode
) >= 4)
1078 /* Fall back to generic for multi-reg and very large modes. */
1080 return choose_hard_reg_mode (regno
, nregs
, false);
/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt entries.  */
bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
1144 /* Emit an insn that's a simple single-set. Both the operands must be
1145 known to be valid. */
1146 inline static rtx_insn
*
1147 emit_set_insn (rtx x
, rtx y
)
1149 return emit_insn (gen_rtx_SET (x
, y
));
1152 /* X and Y are two things to compare using CODE. Emit the compare insn and
1153 return the rtx for register 0 in the proper mode. */
1155 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1157 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1158 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1160 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1164 /* Build the SYMBOL_REF for __tls_get_addr. */
1166 static GTY(()) rtx tls_get_addr_libfunc
;
1169 aarch64_tls_get_addr (void)
1171 if (!tls_get_addr_libfunc
)
1172 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1173 return tls_get_addr_libfunc
;
1176 /* Return the TLS model to use for ADDR. */
1178 static enum tls_model
1179 tls_symbolic_operand_type (rtx addr
)
1181 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1184 if (GET_CODE (addr
) == CONST
)
1186 split_const (addr
, &sym
, &addend
);
1187 if (GET_CODE (sym
) == SYMBOL_REF
)
1188 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1190 else if (GET_CODE (addr
) == SYMBOL_REF
)
1191 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1196 /* We'll allow lo_sum's in addresses in our legitimate addresses
1197 so that combine would take care of combining addresses where
1198 necessary, but for generation purposes, we'll generate the address
1201 tmp = hi (symbol_ref); adrp x1, foo
1202 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1206 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1207 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1211 Load TLS symbol, depending on TLS mechanism and TLS access model.
1213 Global Dynamic - Traditional TLS:
1214 adrp tmp, :tlsgd:imm
1215 add dest, tmp, #:tlsgd_lo12:imm
1218 Global Dynamic - TLS Descriptors:
1219 adrp dest, :tlsdesc:imm
1220 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1221 add dest, dest, #:tlsdesc_lo12:imm
1228 adrp tmp, :gottprel:imm
1229 ldr dest, [tmp, #:gottprel_lo12:imm]
1234 add t0, tp, #:tprel_hi12:imm, lsl #12
1235 add t0, t0, #:tprel_lo12_nc:imm
1239 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1240 enum aarch64_symbol_type type
)
1244 case SYMBOL_SMALL_ABSOLUTE
:
1246 /* In ILP32, the mode of dest can be either SImode or DImode. */
1248 machine_mode mode
= GET_MODE (dest
);
1250 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1252 if (can_create_pseudo_p ())
1253 tmp_reg
= gen_reg_rtx (mode
);
1255 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1256 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1260 case SYMBOL_TINY_ABSOLUTE
:
1261 emit_insn (gen_rtx_SET (dest
, imm
));
1264 case SYMBOL_SMALL_GOT_28K
:
1266 machine_mode mode
= GET_MODE (dest
);
1267 rtx gp_rtx
= pic_offset_table_rtx
;
	  /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	     here before rtl expand.  Tree IVOPT will generate rtl pattern to
	     decide rtx costs, in which case pic_offset_table_rtx is not
	     initialized.  For that case no need to generate the first adrp
	     instruction as the final cost for global variable access is
	     one instruction.  */

	  /* -fpic for -mcmodel=small allows a 32K GOT table size (but we are
	     using the page base as GOT base, the first page may be wasted,
	     in the worst scenario, there is only 28K space for GOT).

	     The generated instruction sequence for accessing a global
	     variable is:

		ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	     Only one instruction is needed.  But we must initialize
	     pic_offset_table_rtx properly.  We generate an initialization
	     insn for every global access, and rely on CSE to remove all
	     redundant copies.

	     The final instruction sequence will look like the following
	     for multiple global variable accesses:

	       adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

	       ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
	       ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
	       ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
	       ...  */
1302 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1303 crtl
->uses_pic_offset_table
= 1;
1304 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1306 if (mode
!= GET_MODE (gp_rtx
))
1307 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1311 if (mode
== ptr_mode
)
1314 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1316 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1318 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1322 gcc_assert (mode
== Pmode
);
1324 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1325 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
	/* The operand is expected to be a MEM.  Whenever the related insn
	   pattern changes, the code above which computes MEM should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
1332 MEM_READONLY_P (mem
) = 1;
1333 MEM_NOTRAP_P (mem
) = 1;
1338 case SYMBOL_SMALL_GOT_4G
:
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the GOT entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */
1351 machine_mode mode
= GET_MODE (dest
);
1353 if (can_create_pseudo_p ())
1354 tmp_reg
= gen_reg_rtx (mode
);
1356 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1357 if (mode
== ptr_mode
)
1360 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1362 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1364 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1368 gcc_assert (mode
== Pmode
);
1370 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1371 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1374 gcc_assert (GET_CODE (mem
) == MEM
);
1375 MEM_READONLY_P (mem
) = 1;
1376 MEM_NOTRAP_P (mem
) = 1;
1381 case SYMBOL_SMALL_TLSGD
:
1384 machine_mode mode
= GET_MODE (dest
);
1385 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1389 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1391 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1392 insns
= get_insns ();
1395 RTL_CONST_CALL_P (insns
) = 1;
1396 emit_libcall_block (insns
, dest
, result
, imm
);
1400 case SYMBOL_SMALL_TLSDESC
:
1402 machine_mode mode
= GET_MODE (dest
);
1403 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1406 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1408 /* In ILP32, the got entry is always of SImode size. Unlike
1409 small GOT, the dest is fixed at reg 0. */
1411 emit_insn (gen_tlsdesc_small_si (imm
));
1413 emit_insn (gen_tlsdesc_small_di (imm
));
1414 tp
= aarch64_load_tp (NULL
);
1417 tp
= gen_lowpart (mode
, tp
);
1419 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1420 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1424 case SYMBOL_SMALL_TLSIE
:
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the GOT entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
1433 machine_mode mode
= GET_MODE (dest
);
1434 rtx tmp_reg
= gen_reg_rtx (mode
);
1435 rtx tp
= aarch64_load_tp (NULL
);
1437 if (mode
== ptr_mode
)
1440 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1443 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1444 tp
= gen_lowpart (mode
, tp
);
1449 gcc_assert (mode
== Pmode
);
1450 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1453 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1454 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1458 case SYMBOL_TLSLE12
:
1459 case SYMBOL_TLSLE24
:
1460 case SYMBOL_TLSLE32
:
1461 case SYMBOL_TLSLE48
:
1463 machine_mode mode
= GET_MODE (dest
);
1464 rtx tp
= aarch64_load_tp (NULL
);
1467 tp
= gen_lowpart (mode
, tp
);
1471 case SYMBOL_TLSLE12
:
1472 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1475 case SYMBOL_TLSLE24
:
1476 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1479 case SYMBOL_TLSLE32
:
1480 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1482 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1485 case SYMBOL_TLSLE48
:
1486 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1488 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1495 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1499 case SYMBOL_TINY_GOT
:
1500 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1503 case SYMBOL_TINY_TLSIE
:
1505 machine_mode mode
= GET_MODE (dest
);
1506 rtx tp
= aarch64_load_tp (NULL
);
1508 if (mode
== ptr_mode
)
1511 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1514 tp
= gen_lowpart (mode
, tp
);
1515 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1520 gcc_assert (mode
== Pmode
);
1521 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1524 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1533 /* Emit a move from SRC to DEST. Assume that the move expanders can
1534 handle all moves if !can_create_pseudo_p (). The distinction is
1535 important because, unlike emit_move_insn, the move expanders know
1536 how to force Pmode objects into the constant pool even when the
1537 constant pool address is not itself legitimate. */
1539 aarch64_emit_move (rtx dest
, rtx src
)
1541 return (can_create_pseudo_p ()
1542 ? emit_move_insn (dest
, src
)
1543 : emit_move_insn_1 (dest
, src
));
1546 /* Split a 128-bit move operation into two 64-bit move operations,
1547 taking care to handle partial overlap of register to register
1548 copies. Special cases are needed when moving between GP regs and
1549 FP regs. SRC can be a register, constant or memory; DST a register
1550 or memory. If either operand is memory it must not have any side
1553 aarch64_split_128bit_move (rtx dst
, rtx src
)
1558 machine_mode mode
= GET_MODE (dst
);
1560 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1561 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1562 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1564 if (REG_P (dst
) && REG_P (src
))
1566 int src_regno
= REGNO (src
);
1567 int dst_regno
= REGNO (dst
);
1569 /* Handle FP <-> GP regs. */
1570 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1572 src_lo
= gen_lowpart (word_mode
, src
);
1573 src_hi
= gen_highpart (word_mode
, src
);
1577 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1578 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1582 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1583 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1587 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1589 dst_lo
= gen_lowpart (word_mode
, dst
);
1590 dst_hi
= gen_highpart (word_mode
, dst
);
1594 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1595 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1599 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1600 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1606 dst_lo
= gen_lowpart (word_mode
, dst
);
1607 dst_hi
= gen_highpart (word_mode
, dst
);
1608 src_lo
= gen_lowpart (word_mode
, src
);
1609 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1611 /* At most one pairing may overlap. */
1612 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1614 aarch64_emit_move (dst_hi
, src_hi
);
1615 aarch64_emit_move (dst_lo
, src_lo
);
1619 aarch64_emit_move (dst_lo
, src_lo
);
1620 aarch64_emit_move (dst_hi
, src_hi
);
1625 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1627 return (! REG_P (src
)
1628 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1631 /* Split a complex SIMD combine. */
1634 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1636 machine_mode src_mode
= GET_MODE (src1
);
1637 machine_mode dst_mode
= GET_MODE (dst
);
1639 gcc_assert (VECTOR_MODE_P (dst_mode
));
1641 if (REG_P (dst
) && REG_P (src1
) && REG_P (src2
))
1643 rtx (*gen
) (rtx
, rtx
, rtx
);
1648 gen
= gen_aarch64_simd_combinev8qi
;
1651 gen
= gen_aarch64_simd_combinev4hi
;
1654 gen
= gen_aarch64_simd_combinev2si
;
1657 gen
= gen_aarch64_simd_combinev4hf
;
1660 gen
= gen_aarch64_simd_combinev2sf
;
1663 gen
= gen_aarch64_simd_combinedi
;
1666 gen
= gen_aarch64_simd_combinedf
;
1672 emit_insn (gen (dst
, src1
, src2
));
1677 /* Split a complex SIMD move. */
1680 aarch64_split_simd_move (rtx dst
, rtx src
)
1682 machine_mode src_mode
= GET_MODE (src
);
1683 machine_mode dst_mode
= GET_MODE (dst
);
1685 gcc_assert (VECTOR_MODE_P (dst_mode
));
1687 if (REG_P (dst
) && REG_P (src
))
1689 rtx (*gen
) (rtx
, rtx
);
1691 gcc_assert (VECTOR_MODE_P (src_mode
));
1696 gen
= gen_aarch64_split_simd_movv16qi
;
1699 gen
= gen_aarch64_split_simd_movv8hi
;
1702 gen
= gen_aarch64_split_simd_movv4si
;
1705 gen
= gen_aarch64_split_simd_movv2di
;
1708 gen
= gen_aarch64_split_simd_movv8hf
;
1711 gen
= gen_aarch64_split_simd_movv4sf
;
1714 gen
= gen_aarch64_split_simd_movv2df
;
1720 emit_insn (gen (dst
, src
));
1726 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
1727 machine_mode ymode
, rtx y
)
1729 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
1730 gcc_assert (r
!= NULL
);
1731 return rtx_equal_p (x
, r
);
1736 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1738 if (can_create_pseudo_p ())
1739 return force_reg (mode
, value
);
1742 x
= aarch64_emit_move (x
, value
);
1749 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1751 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1754 /* Load the full offset into a register. This
1755 might be improvable in the future. */
1756 high
= GEN_INT (offset
);
1758 high
= aarch64_force_temporary (mode
, temp
, high
);
1759 reg
= aarch64_force_temporary (mode
, temp
,
1760 gen_rtx_PLUS (mode
, high
, reg
));
1762 return plus_constant (mode
, reg
, offset
);
1766 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1770 unsigned HOST_WIDE_INT val
, val2
, mask
;
1771 int one_match
, zero_match
;
1776 if (aarch64_move_imm (val
, mode
))
1779 emit_insn (gen_rtx_SET (dest
, imm
));
1783 if ((val
>> 32) == 0 || mode
== SImode
)
1787 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
1789 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1790 GEN_INT ((val
>> 16) & 0xffff)));
1792 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
1793 GEN_INT ((val
>> 16) & 0xffff)));
1798 /* Remaining cases are all for DImode. */
1801 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
1802 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
1803 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
1804 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
1806 if (zero_match
!= 2 && one_match
!= 2)
1808 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1809 For a 64-bit bitmask try whether changing 16 bits to all ones or
1810 zeroes creates a valid bitmask. To check any repeated bitmask,
1811 try using 16 bits from the other 32-bit half of val. */
1813 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1816 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1819 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1821 val2
= val2
& ~mask
;
1822 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
1823 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1830 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1831 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1832 GEN_INT ((val
>> i
) & 0xffff)));
  /* Generate 2-4 instructions, skipping 16-bit chunks that are all zeroes
     or all ones, as those are already set up by the initial mov.  If
     one_match > zero_match, skip set bits, otherwise skip zero bits.  */
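
  /* For example, the immediate 0x1234000000005678 is synthesized as

	mov	x0, 0x5678
	movk	x0, 0x1234, lsl 48

     because the two all-zero 16-bit chunks need no MOVK (illustration only;
     the destination register is arbitrary).  */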
1844 val2
= one_match
> zero_match
? ~val
: val
;
1845 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
1848 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
1849 ? (val
| ~(mask
<< i
))
1850 : (val
& (mask
<< i
)))));
1851 for (i
+= 16; i
< 64; i
+= 16)
1853 if ((val2
& (mask
<< i
)) == 0)
1856 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1857 GEN_INT ((val
>> i
) & 0xffff)));
1866 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1868 machine_mode mode
= GET_MODE (dest
);
1870 gcc_assert (mode
== SImode
|| mode
== DImode
);
1872 /* Check on what type of symbol it is. */
1873 if (GET_CODE (imm
) == SYMBOL_REF
1874 || GET_CODE (imm
) == LABEL_REF
1875 || GET_CODE (imm
) == CONST
)
1877 rtx mem
, base
, offset
;
1878 enum aarch64_symbol_type sty
;
1880 /* If we have (const (plus symbol offset)), separate out the offset
1881 before we start classifying the symbol. */
1882 split_const (imm
, &base
, &offset
);
1884 sty
= aarch64_classify_symbol (base
, offset
);
1887 case SYMBOL_FORCE_TO_MEM
:
1888 if (offset
!= const0_rtx
1889 && targetm
.cannot_force_const_mem (mode
, imm
))
1891 gcc_assert (can_create_pseudo_p ());
1892 base
= aarch64_force_temporary (mode
, dest
, base
);
1893 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1894 aarch64_emit_move (dest
, base
);
1898 mem
= force_const_mem (ptr_mode
, imm
);
1901 /* If we aren't generating PC relative literals, then
1902 we need to expand the literal pool access carefully.
1903 This is something that needs to be done in a number
1904 of places, so could well live as a separate function. */
1905 if (!aarch64_pcrelative_literal_loads
)
1907 gcc_assert (can_create_pseudo_p ());
1908 base
= gen_reg_rtx (ptr_mode
);
1909 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
1910 mem
= gen_rtx_MEM (ptr_mode
, base
);
1913 if (mode
!= ptr_mode
)
1914 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
1916 emit_insn (gen_rtx_SET (dest
, mem
));
1920 case SYMBOL_SMALL_TLSGD
:
1921 case SYMBOL_SMALL_TLSDESC
:
1922 case SYMBOL_SMALL_TLSIE
:
1923 case SYMBOL_SMALL_GOT_28K
:
1924 case SYMBOL_SMALL_GOT_4G
:
1925 case SYMBOL_TINY_GOT
:
1926 case SYMBOL_TINY_TLSIE
:
1927 if (offset
!= const0_rtx
)
1929 gcc_assert(can_create_pseudo_p ());
1930 base
= aarch64_force_temporary (mode
, dest
, base
);
1931 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1932 aarch64_emit_move (dest
, base
);
1937 case SYMBOL_SMALL_ABSOLUTE
:
1938 case SYMBOL_TINY_ABSOLUTE
:
1939 case SYMBOL_TLSLE12
:
1940 case SYMBOL_TLSLE24
:
1941 case SYMBOL_TLSLE32
:
1942 case SYMBOL_TLSLE48
:
1943 aarch64_load_symref_appropriately (dest
, imm
, sty
);
1951 if (!CONST_INT_P (imm
))
1953 if (GET_CODE (imm
) == HIGH
)
1954 emit_insn (gen_rtx_SET (dest
, imm
));
1957 rtx mem
= force_const_mem (mode
, imm
);
1959 emit_insn (gen_rtx_SET (dest
, mem
));
1965 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
/* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
   temporary value if necessary.  FRAME_RELATED_P should be true if
   the RTX_FRAME_RELATED flag should be set and CFA adjustments added
   to the generated instructions.  If SCRATCHREG is known to hold
   abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
   move immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */
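
/* For instance (illustrative values only), decreasing SP by 0x12340 is split
   into two subtractions that never raise SP above its starting value:

	sub	sp, sp, #0x340
	sub	sp, sp, #0x12000  */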
1981 aarch64_add_constant_internal (machine_mode mode
, int regnum
, int scratchreg
,
1982 HOST_WIDE_INT delta
, bool frame_related_p
,
1985 HOST_WIDE_INT mdelta
= abs_hwi (delta
);
1986 rtx this_rtx
= gen_rtx_REG (mode
, regnum
);
1992 /* Single instruction adjustment. */
1993 if (aarch64_uimm12_shift (mdelta
))
1995 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
)));
1996 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
     Only do this if mdelta is not a 16-bit move, as adjusting using a move
     immediate is better in that case.  */
2003 if (mdelta
< 0x1000000 && !aarch64_move_imm (mdelta
, mode
))
2005 HOST_WIDE_INT low_off
= mdelta
& 0xfff;
2007 low_off
= delta
< 0 ? -low_off
: low_off
;
2008 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (low_off
)));
2009 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2010 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
- low_off
)));
2011 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2015 /* Emit a move immediate if required and an addition/subtraction. */
2016 rtx scratch_rtx
= gen_rtx_REG (mode
, scratchreg
);
2018 aarch64_internal_mov_immediate (scratch_rtx
, GEN_INT (mdelta
), true, mode
);
2019 insn
= emit_insn (delta
< 0 ? gen_sub2_insn (this_rtx
, scratch_rtx
)
2020 : gen_add2_insn (this_rtx
, scratch_rtx
));
2021 if (frame_related_p
)
2023 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2024 rtx adj
= plus_constant (mode
, this_rtx
, delta
);
2025 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (this_rtx
, adj
));
2030 aarch64_add_constant (machine_mode mode
, int regnum
, int scratchreg
,
2031 HOST_WIDE_INT delta
)
2033 aarch64_add_constant_internal (mode
, regnum
, scratchreg
, delta
, false, true);
2037 aarch64_add_sp (int scratchreg
, HOST_WIDE_INT delta
, bool emit_move_imm
)
2039 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, delta
,
2040 true, emit_move_imm
);
2044 aarch64_sub_sp (int scratchreg
, HOST_WIDE_INT delta
, bool frame_related_p
)
2046 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, -delta
,
2047 frame_related_p
, true);
2051 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
2052 tree exp ATTRIBUTE_UNUSED
)
2054 /* Currently, always true. */
2058 /* Implement TARGET_PASS_BY_REFERENCE. */
2061 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
2064 bool named ATTRIBUTE_UNUSED
)
2067 machine_mode dummymode
;
2070 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2071 size
= (mode
== BLKmode
&& type
)
2072 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
2074 /* Aggregates are passed by reference based on their size. */
2075 if (type
&& AGGREGATE_TYPE_P (type
))
2077 size
= int_size_in_bytes (type
);
2080 /* Variable sized arguments are always returned by reference. */
2084 /* Can this be a candidate to be passed in fp/simd register(s)? */
2085 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
2096 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2098 aarch64_return_in_msb (const_tree valtype
)
2100 machine_mode dummy_mode
;
2103 /* Never happens in little-endian mode. */
2104 if (!BYTES_BIG_ENDIAN
)
2107 /* Only composite types smaller than or equal to 16 bytes can
2108 be potentially returned in registers. */
2109 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
2110 || int_size_in_bytes (valtype
) <= 0
2111 || int_size_in_bytes (valtype
) > 16)
  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     registers.  */
2118 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
2119 &dummy_mode
, &dummy_int
, NULL
))
2125 /* Implement TARGET_FUNCTION_VALUE.
2126 Define how to find the value returned by a function. */
2129 aarch64_function_value (const_tree type
, const_tree func
,
2130 bool outgoing ATTRIBUTE_UNUSED
)
2135 machine_mode ag_mode
;
2137 mode
= TYPE_MODE (type
);
2138 if (INTEGRAL_TYPE_P (type
))
2139 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
2141 if (aarch64_return_in_msb (type
))
2143 HOST_WIDE_INT size
= int_size_in_bytes (type
);
2145 if (size
% UNITS_PER_WORD
!= 0)
2147 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
2148 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
2152 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2153 &ag_mode
, &count
, NULL
))
2155 if (!aarch64_composite_type_p (type
, mode
))
2157 gcc_assert (count
== 1 && mode
== ag_mode
);
2158 return gen_rtx_REG (mode
, V0_REGNUM
);
2165 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
2166 for (i
= 0; i
< count
; i
++)
2168 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
2169 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2170 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
2171 XVECEXP (par
, 0, i
) = tmp
;
2177 return gen_rtx_REG (mode
, R0_REGNUM
);
2180 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2181 Return true if REGNO is the number of a hard register in which the values
2182 of called function may come back. */
2185 aarch64_function_value_regno_p (const unsigned int regno
)
2187 /* Maximum of 16 bytes can be returned in the general registers. Examples
2188 of 16-byte return values are: 128-bit integers and 16-byte small
2189 structures (excluding homogeneous floating-point aggregates). */
2190 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
2193 /* Up to four fp/simd registers can return a function value, e.g. a
2194 homogeneous floating-point aggregate having four members. */
2195 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
2196 return TARGET_FLOAT
;
2201 /* Implement TARGET_RETURN_IN_MEMORY.
2203 If the type T of the result of a function is such that
2205 would require that arg be passed as a value in a register (or set of
2206 registers) according to the parameter passing rules, then the result
2207 is returned in the same registers as would be used for such an
2211 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
2214 machine_mode ag_mode
;
2217 if (!AGGREGATE_TYPE_P (type
)
2218 && TREE_CODE (type
) != COMPLEX_TYPE
2219 && TREE_CODE (type
) != VECTOR_TYPE
)
2220 /* Simple scalar types always returned in registers. */
2223 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
2230 /* Types larger than 2 registers returned in memory. */
2231 size
= int_size_in_bytes (type
);
2232 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
2236 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
2237 const_tree type
, int *nregs
)
2239 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2240 return aarch64_vfp_is_call_or_return_candidate (mode
,
2242 &pcum
->aapcs_vfp_rmode
,
2247 /* Given MODE and TYPE of a function argument, return the alignment in
2248 bits. The idea is to suppress any stronger alignment requested by
2249 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2250 This is a helper function for local use only. */
2253 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
2256 return GET_MODE_ALIGNMENT (mode
);
2257 if (integer_zerop (TYPE_SIZE (type
)))
2260 gcc_assert (TYPE_MODE (type
) == mode
);
2262 if (!AGGREGATE_TYPE_P (type
))
2263 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
2265 if (TREE_CODE (type
) == ARRAY_TYPE
)
2266 return TYPE_ALIGN (TREE_TYPE (type
));
2268 unsigned int alignment
= 0;
2270 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
2271 alignment
= std::max (alignment
, DECL_ALIGN (field
));
2276 /* Layout a function argument according to the AAPCS64 rules. The rule
2277 numbers refer to the rule numbers in the AAPCS64. */
2280 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2282 bool named ATTRIBUTE_UNUSED
)
2284 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2285 int ncrn
, nvrn
, nregs
;
2286 bool allocate_ncrn
, allocate_nvrn
;
2289 /* We need to do this once per argument. */
2290 if (pcum
->aapcs_arg_processed
)
2293 pcum
->aapcs_arg_processed
= true;
2295 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2297 = ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
2300 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
2301 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
2306 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2307 The following code thus handles passing by SIMD/FP registers first. */
2309 nvrn
= pcum
->aapcs_nvrn
;
  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
2316 aarch64_err_no_fpadvsimd (mode
, "argument");
2318 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
2320 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
2321 if (!aarch64_composite_type_p (type
, mode
))
2323 gcc_assert (nregs
== 1);
2324 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
2330 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2331 for (i
= 0; i
< nregs
; i
++)
2333 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
2334 V0_REGNUM
+ nvrn
+ i
);
2335 tmp
= gen_rtx_EXPR_LIST
2337 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
2338 XVECEXP (par
, 0, i
) = tmp
;
2340 pcum
->aapcs_reg
= par
;
2346 /* C.3 NSRN is set to 8. */
2347 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
2352 ncrn
= pcum
->aapcs_ncrn
;
2353 nregs
= size
/ UNITS_PER_WORD
;
2355 /* C6 - C9. though the sign and zero extension semantics are
2356 handled elsewhere. This is the case where the argument fits
2357 entirely general registers. */
2358 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
2360 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
2362 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
2364 /* C.8 if the argument has an alignment of 16 then the NGRN is
2365 rounded up to the next even number. */
2366 if (nregs
== 2 && alignment
== 16 * BITS_PER_UNIT
&& ncrn
% 2)
2369 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
2371 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2372 A reg is still generated for it, but the caller should be smart
2373 enough not to use it. */
2374 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
2376 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
2383 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2384 for (i
= 0; i
< nregs
; i
++)
2386 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
2387 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2388 GEN_INT (i
* UNITS_PER_WORD
));
2389 XVECEXP (par
, 0, i
) = tmp
;
2391 pcum
->aapcs_reg
= par
;
2394 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
2399 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
2401 /* The argument is passed on stack; record the needed number of words for
2402 this argument and align the total size if necessary. */
2404 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
2405 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2406 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
2407 16 / UNITS_PER_WORD
);
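/* A worked example (illustrative only, not taken from the original source):
   for  struct hfa { float a, b, c, d; }  passed by value,
   aarch64_vfp_is_call_candidate returns true with nregs == 4, so rules
   C.1-C.3 above allocate s0-s3 (v0-v3) provided at least four SIMD/FP
   argument registers remain; otherwise the NSRN is set to 8 and the
   argument is passed on the stack.  */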
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
		      const_tree type, bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);

  if (mode == VOIDmode)
    return NULL_RTX;

  aarch64_layout_arg (pcum_v, mode, type, named);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
			      const_tree fntype ATTRIBUTE_UNUSED,
			      rtx libname ATTRIBUTE_UNUSED,
			      const_tree fndecl ATTRIBUTE_UNUSED,
			      unsigned n_named ATTRIBUTE_UNUSED)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;

  if (!TARGET_FLOAT
      && fndecl && TREE_PUBLIC (fndecl)
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
						   &mode, &nregs, NULL))
	aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
    }
  return;
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
			      machine_mode mode,
			      const_tree type,
			      bool named)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64)
    {
      aarch64_layout_arg (pcum_v, mode, type, named);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
		  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);

  if (alignment < PARM_BOUNDARY)
    alignment = PARM_BOUNDARY;
  if (alignment > STACK_BOUNDARY)
    alignment = STACK_BOUNDARY;
  return alignment;
}
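/* A worked example (illustrative only, not taken from the original source):
   a char argument has a natural alignment of 8 bits, which is clamped up
   to PARM_BOUNDARY (64 bits); a 16-byte-aligned struct requests 128 bits,
   which does not exceed STACK_BOUNDARY and is kept, so its stack slot (and
   the NSAA) stays 16-byte aligned.  */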
/* For use by FUNCTION_ARG_PADDING (MODE, TYPE).

   Return true if an argument passed on the stack should be padded upwards,
   i.e. if the least-significant byte of the stack slot has useful data.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

bool
aarch64_pad_arg_upward (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return true;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
	 || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return false;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return true;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
			bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
			    : GET_MODE_SIZE (mode));
      if (size < 2 * UNITS_PER_WORD)
	return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static machine_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG  9
#define PROBE_STACK_SECOND_REG 10
/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
{
  rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
		     plus_constant (ptr_mode,
				    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
		     plus_constant (ptr_mode, stack_pointer_rtx,
				    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
	 it exceeds SIZE.  If only two probes are needed, this will not
	 generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
	{
	  emit_set_insn (reg1,
			 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
	  emit_stack_probe (reg1);
	}

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
	{
	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	  emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
	  emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
	}
      else
	emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;

      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
		     plus_constant (ptr_mode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      emit_set_insn (reg2,
		     plus_constant (ptr_mode, stack_pointer_rtx,
				    -(first + rounded_size)));

      /* Step 3: the loop

	 do
	   {
	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
	     probe at TEST_ADDR
	   }
	 while (TEST_ADDR != LAST_ADDR)

	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
	 until it is equal to ROUNDED_SIZE.  */

      if (ptr_mode == DImode)
	emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
      else
	emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));

      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
	 that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
	{
	  HOST_WIDE_INT rem = size - rounded_size;

	  if (rem > 256)
	    {
	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	      emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
	      emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
	    }
	  else
	    emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
	}
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
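/* An illustrative probe sequence for a small fixed size (assumed
   PROBE_INTERVAL of 4096; not taken from a real compilation):

	sub	x9, sp, #8192
	str	xzr, [x9]
	sub	x9, x9, #4096
	str	xzr, [x9]

   i.e. one store of xzr for every PROBE_INTERVAL below the stack pointer.  */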
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  xops[1] = GEN_INT (PROBE_INTERVAL);
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at TEST_ADDR.  */
  output_asm_insn ("str\txzr, [%0]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
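/* The emitted loop looks roughly like this (illustrative, assuming a 4 KiB
   PROBE_INTERVAL; the label name is compiler-generated):

	.LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0
*/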
static bool
aarch64_frame_pointer_required (void)
{
  /* In aarch64_override_options_after_change
     flag_omit_leaf_frame_pointer turns off the frame pointer by
     default.  Turn it back on now if we've not got a leaf
     function.  */
  if (flag_omit_leaf_frame_pointer
      && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
    return true;

  /* Force a frame pointer for EH returns so the return address is at FP+8.  */
  if (crtl->calls_eh_return)
    return true;

  return false;
}
/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */

static void
aarch64_layout_frame (void)
{
  HOST_WIDE_INT offset = 0;
  int regno, last_fp_reg = INVALID_REGNUM;

  if (reload_completed && cfun->machine->frame.laid_out)
    return;

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
	= SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& (regno == R30_REGNUM
	    || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !call_used_regs[regno])
      {
	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
	last_fp_reg = regno;
      }

  if (frame_pointer_needed)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset += 2 * UNITS_PER_WORD;
    }

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
	cfun->machine->frame.reg_offset[regno] = offset;
	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate1 = regno;
	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
	/* If there is an alignment gap between integer and fp callee-saves,
	   allocate the last fp register to it if possible.  */
	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
	  {
	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
	    break;
	  }

	cfun->machine->frame.reg_offset[regno] = offset;
	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate1 = regno;
	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
	  cfun->machine->frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.hard_fp_offset
    = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
		STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.frame_size
    = ROUND_UP (cfun->machine->frame.hard_fp_offset
		+ crtl->outgoing_args_size,
		STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  if (cfun->machine->frame.frame_size < max_push_offset
      && crtl->outgoing_args_size == 0)
    {
      /* Simple, small frame with no outgoing arguments:
	 stp reg1, reg2, [sp, -frame_size]!
	 stp reg3, reg4, [sp, 16]  */
      cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
    }
  else if ((crtl->outgoing_args_size
	    + cfun->machine->frame.saved_regs_size < 512)
	   && !(cfun->calls_alloca
		&& cfun->machine->frame.hard_fp_offset < max_push_offset))
    {
      /* Frame with small outgoing arguments:
	 sub sp, sp, frame_size
	 stp reg1, reg2, [sp, outgoing_args_size]
	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
	= cfun->machine->frame.frame_size
	  - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
	 stp reg1, reg2, [sp, -hard_fp_offset]!
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size
	  - cfun->machine->frame.callee_adjust;
    }
  else if (!frame_pointer_needed
	   && varargs_and_saved_regs_size < max_push_offset)
    {
      /* Frame with large local area and outgoing arguments (this pushes the
	 callee-saves first, followed by the locals and outgoing area):
	 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, frame_size - varargs_and_saved_regs_size  */
      cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size
	  - cfun->machine->frame.callee_adjust;
      cfun->machine->frame.hard_fp_offset
	= cfun->machine->frame.callee_adjust;
      cfun->machine->frame.locals_offset
	= cfun->machine->frame.hard_fp_offset;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
	 sub sp, sp, hard_fp_offset
	 stp x29, x30, [sp, 0]
	 add x29, sp, 0
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.initial_adjust
	= cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size
	  - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
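/* A worked example (illustrative numbers, not from a real compilation):
   a function that saves x29, x30 and x19 with 8 bytes of locals and no
   outgoing arguments gets saved_regs_size = 32 and frame_size = 48.
   Since 48 < max_push_offset (512) and outgoing_args_size == 0, the first
   case above applies: the whole frame is allocated by
	stp x29, x30, [sp, -48]!
   with callee_adjust = 48 and zero initial/final adjustments.  */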
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_rtx_MEM (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2,
				   GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2,
				   GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case DImode:
      return gen_store_pairdi (mem1, reg1, mem2, reg2);

    case DFmode:
      return gen_store_pairdf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case DImode:
      return gen_load_pairdi (reg1, mem1, reg2, mem2);

    case DFmode:
      return gen_load_pairdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
     function if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  */

static void
aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
			   unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
					    ? gen_frame_mem : gen_rtx_MEM);
  unsigned regno;
  unsigned regno2;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      HOST_WIDE_INT offset;

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
					      offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
	      == cfun->machine->frame.reg_offset[regno2]))

	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
						   offset));
	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
						    reg2));

	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts, are only
	     frame-related if explicitly marked.  */
	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
	  regno = regno2;
	}
      else
	insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}
/* Emit code to restore the callee registers of mode MODE from register
   number START up to and including LIMIT.  Restore from the stack offset
   START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
   Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (machine_mode mode,
			      HOST_WIDE_INT start_offset, unsigned start,
			      unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
					    ? gen_frame_mem : gen_rtx_MEM);
  unsigned regno;
  unsigned regno2;
  HOST_WIDE_INT offset;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      rtx reg, mem;

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
	      == cfun->machine->frame.reg_offset[regno2]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  regno = regno2;
	}
      else
	emit_move_insn (reg, mem);

      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
static inline bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
			       HOST_WIDE_INT offset)
{
  return offset >= -256 && offset < 256;
}

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= 0
	  && offset < 4096 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= -64 * GET_MODE_SIZE (mode)
	  && offset < 64 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}
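/* Illustrative values for DImode (8-byte) accesses, not an exhaustive list:
   the 9-bit unscaled form accepts byte offsets -256..255 (e.g. -8 or 248);
   the 12-bit scaled form accepts multiples of 8 in 0..32760;
   the 7-bit scaled LDP/STP form accepts multiples of 8 in -512..504.  */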
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  aarch64_layout_frame ();

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
	if (!frame_pointer_needed)
	  offset += cfun->machine->frame.frame_size
		    - cfun->machine->frame.hard_fp_offset;
	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (offset_12bit_unsigned_scaled_p (DImode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If aarch64_layout_frame has chosen registers to store/restore with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_regs[regno])
	&& (bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      bitmap_set_bit (components, regno);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}
/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
	 so DFmode for the vector registers is enough.  */
      machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
      rtx reg = gen_rtx_REG (mode, regno);
      HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (offset2 - cfun->machine->frame.reg_offset[regno])
	     != GET_MODE_SIZE (mode))
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}
/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  aarch64_layout_frame ();

  HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
  HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
  HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      insn = emit_insn (gen_pacisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = frame_size;

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
	    aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
					    frame_size - STACK_CHECK_PROTECT);
	}
      else if (frame_size > 0)
	aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
    }

  aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (frame_pointer_needed)
    {
      if (callee_adjust == 0)
	aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
				   R30_REGNUM, false);
      insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
				       stack_pointer_rtx,
				       GEN_INT (callee_offset)));
      RTX_FRAME_RELATED_P (insn) = 1;
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || frame_pointer_needed);
  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			     callee_adjust != 0 || frame_pointer_needed);
  aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
}
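/* An illustrative prologue for a frame-pointer function with one extra
   callee-save (x19) and no large areas (assumed numbers, not taken from a
   real compilation):

	pacisp				// only with return-address signing
	stp	x29, x30, [sp, -48]!	// callee_adjust == 48
	mov	x29, sp			// establish the frame record
	str	x19, [sp, 16]
*/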
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  aarch64_layout_frame ();

  return cfun->machine->frame.frame_size == 0;
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  aarch64_layout_frame ();

  HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
  HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p = (get_frame_size ()
			 + cfun->machine->frame.saved_varargs_size) != 0;

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
    {
      insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
				       hard_frame_pointer_rtx,
				       GEN_INT (-callee_offset)));
      /* If writeback is used when restoring callee-saves, the CFA
	 is restored on the instruction doing the writeback.  */
      RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
    }
  else
    aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || initial_adjust > 65536)
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	  */
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      insn = emit_insn (gen_autisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
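/* An illustrative epilogue matching the small-frame prologue shown earlier
   (assumed numbers, not taken from a real compilation):

	ldr	x19, [sp, 16]
	ldp	x29, x30, [sp], 48	// pop with writeback, callee_adjust == 48
	retaa				// or  autisp; ret  before ARMv8.3-A
*/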
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer maybe bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  if (vcall_offset == 0)
    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      this_rtx = gen_rtx_REG (Pmode, this_regno);
      temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
      temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}
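/* Illustrative values: 0x123 and 0xfff are accepted with shift 0, and
   0x123000 with shift 12, while 0x1234 has bits in both halves and is
   rejected (it has to be synthesised with two add/sub instructions).  */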
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
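/* Illustrative values: 0xabcd, 0xabcd0000, 0x123400000000 and
   0xffff000000000000 each occupy a single 16-bit chunk at a 16-bit-aligned
   position, so a single MOVZ materialises them; 0x10001 spans two chunks
   and is rejected.  */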
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = (unsigned HOST_WIDE_INT) val_in;
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
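/* A worked example (illustrative): val_in = 0x0f0f0f0f0f0f0f0f.  Each run
   of ones is 4 bits wide and the distance between consecutive runs is 8
   bits, which is a power of two, and the pattern repeats 64/8 times, so
   this is a valid bitmask immediate (usable directly by AND/ORR/EOR).
   By contrast 0x0000000000f000f1 mixes run widths and spacings and is
   rejected.  */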
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}
/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  if (aarch64_bitmask_imm (val_in, mode))
    return false;

  if (aarch64_move_imm (val_in, mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, mode);
}
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
    return 1;
  return aarch64_bitmask_imm (val, mode);
}
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, offset)
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}
/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p && GET_CODE (x) == SUBREG)
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
                        machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
            || GET_CODE (x) == ZERO_EXTEND)
           && GET_MODE (x) == DImode
           && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
           && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
               || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
           && GET_MODE (XEXP (x, 0)) == DImode
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
           && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
               || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
           && GET_MODE (XEXP (x, 0)) == DImode
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
            || GET_CODE (x) == ZERO_EXTRACT)
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == MULT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
          || INTVAL (XEXP (x, 2)) != 0)
        shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == MULT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
        shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
            || GET_CODE (x) == ZERO_EXTRACT)
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == ASHIFT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
        ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
          || INTVAL (XEXP (x, 2)) != 0)
        shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
           && GET_MODE (x) == DImode
           && GET_CODE (XEXP (x, 0)) == ASHIFT
           && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
        shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
           && GET_MODE (x) == Pmode
           && GET_MODE (XEXP (x, 0)) == Pmode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
           && GET_MODE (x) == Pmode
           && GET_MODE (XEXP (x, 0)) == Pmode
           && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (GET_CODE (index) == SUBREG)
    index = SUBREG_REG (index);

  if ((shift == 0
       || (shift > 0 && shift <= 3
           && (1 << shift) == GET_MODE_SIZE (mode)))
      && REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
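/* Illustrative examples (not exhaustive) of the index forms accepted
   above, assuming a 4-byte access (so a scale/shift of 2):

     (reg:DI x2)                                  -> [x1, x2]
     (ashift:DI (reg:DI x2) (const_int 2))        -> [x1, x2, lsl #2]
     (sign_extend:DI (reg:SI w2))                 -> [x1, w2, sxtw]
     (mult:DI (sign_extend:DI (reg:SI w2))
              (const_int 4))                      -> [x1, w2, sxtw #2]

   A non-zero shift is only accepted when it matches the access size
   (1 << shift == GET_MODE_SIZE (mode)), which is what the final check
   in aarch64_classify_index enforces.  */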
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
         || mode == SFmode || mode == DFmode
         || (aarch64_vector_mode_supported_p (mode)
             && GET_MODE_SIZE (mode) == 8);
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
           && regno <= LAST_VIRTUAL_POINTER_REGISTER)
          || regno == FRAME_POINTER_REGNUM
          || regno == ARG_POINTER_REGNUM);
}
4306 /* Return true if X is a valid address for machine mode MODE. If it is,
4307 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4308 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4311 aarch64_classify_address (struct aarch64_address_info
*info
,
4312 rtx x
, machine_mode mode
,
4313 RTX_CODE outer_code
, bool strict_p
)
4315 enum rtx_code code
= GET_CODE (x
);
4318 /* On BE, we use load/store pair for all large int mode load/stores.
4319 TI/TFmode may also use a load/store pair. */
4320 bool load_store_pair_p
= (outer_code
== PARALLEL
4323 || (BYTES_BIG_ENDIAN
4324 && aarch64_vect_struct_mode_p (mode
)));
4326 bool allow_reg_index_p
=
4328 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
4329 && !aarch64_vect_struct_mode_p (mode
);
4331 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4333 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
4334 && (code
!= POST_INC
&& code
!= REG
))
4341 info
->type
= ADDRESS_REG_IMM
;
4343 info
->offset
= const0_rtx
;
4344 return aarch64_base_register_rtx_p (x
, strict_p
);
4352 && virt_or_elim_regno_p (REGNO (op0
))
4353 && CONST_INT_P (op1
))
4355 info
->type
= ADDRESS_REG_IMM
;
4362 if (GET_MODE_SIZE (mode
) != 0
4363 && CONST_INT_P (op1
)
4364 && aarch64_base_register_rtx_p (op0
, strict_p
))
4366 HOST_WIDE_INT offset
= INTVAL (op1
);
4368 info
->type
= ADDRESS_REG_IMM
;
4372 /* TImode and TFmode values are allowed in both pairs of X
4373 registers and individual Q registers. The available
4375 X,X: 7-bit signed scaled offset
4376 Q: 9-bit signed offset
4377 We conservatively require an offset representable in either mode.
4378 When performing the check for pairs of X registers i.e. LDP/STP
4379 pass down DImode since that is the natural size of the LDP/STP
4380 instruction memory accesses. */
4381 if (mode
== TImode
|| mode
== TFmode
)
4382 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
4383 && (offset_9bit_signed_unscaled_p (mode
, offset
)
4384 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
4386 /* A 7bit offset check because OImode will emit a ldp/stp
4387 instruction (only big endian will get here).
4388 For ldp/stp instructions, the offset is scaled for the size of a
4389 single element of the pair. */
4391 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
4393 /* Three 9/12 bit offsets checks because CImode will emit three
4394 ldr/str instructions (only big endian will get here). */
4396 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4397 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
4398 || offset_12bit_unsigned_scaled_p (V16QImode
,
4401 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4402 instructions (only big endian will get here). */
4404 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4405 && aarch64_offset_7bit_signed_scaled_p (TImode
,
4408 if (load_store_pair_p
)
4409 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4410 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4412 return (offset_9bit_signed_unscaled_p (mode
, offset
)
4413 || offset_12bit_unsigned_scaled_p (mode
, offset
));
4416 if (allow_reg_index_p
)
4418 /* Look for base + (scaled/extended) index register. */
4419 if (aarch64_base_register_rtx_p (op0
, strict_p
)
4420 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
4425 if (aarch64_base_register_rtx_p (op1
, strict_p
)
4426 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
4439 info
->type
= ADDRESS_REG_WB
;
4440 info
->base
= XEXP (x
, 0);
4441 info
->offset
= NULL_RTX
;
4442 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
4446 info
->type
= ADDRESS_REG_WB
;
4447 info
->base
= XEXP (x
, 0);
4448 if (GET_CODE (XEXP (x
, 1)) == PLUS
4449 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
4450 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
4451 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4453 HOST_WIDE_INT offset
;
4454 info
->offset
= XEXP (XEXP (x
, 1), 1);
4455 offset
= INTVAL (info
->offset
);
4457 /* TImode and TFmode values are allowed in both pairs of X
4458 registers and individual Q registers. The available
4460 X,X: 7-bit signed scaled offset
4461 Q: 9-bit signed offset
4462 We conservatively require an offset representable in either mode.
4464 if (mode
== TImode
|| mode
== TFmode
)
4465 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
4466 && offset_9bit_signed_unscaled_p (mode
, offset
));
4468 if (load_store_pair_p
)
4469 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4470 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4472 return offset_9bit_signed_unscaled_p (mode
, offset
);
4479 /* load literal: pc-relative constant pool entry. Only supported
4480 for SI mode or larger. */
4481 info
->type
= ADDRESS_SYMBOLIC
;
4483 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
4487 split_const (x
, &sym
, &addend
);
4488 return ((GET_CODE (sym
) == LABEL_REF
4489 || (GET_CODE (sym
) == SYMBOL_REF
4490 && CONSTANT_POOL_ADDRESS_P (sym
)
4491 && aarch64_pcrelative_literal_loads
)));
4496 info
->type
= ADDRESS_LO_SUM
;
4497 info
->base
= XEXP (x
, 0);
4498 info
->offset
= XEXP (x
, 1);
4499 if (allow_reg_index_p
4500 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4503 split_const (info
->offset
, &sym
, &offs
);
4504 if (GET_CODE (sym
) == SYMBOL_REF
4505 && (aarch64_classify_symbol (sym
, offs
) == SYMBOL_SMALL_ABSOLUTE
))
4507 /* The symbol and offset must be aligned to the access size. */
4509 unsigned int ref_size
;
4511 if (CONSTANT_POOL_ADDRESS_P (sym
))
4512 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
4513 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
4515 tree exp
= SYMBOL_REF_DECL (sym
);
4516 align
= TYPE_ALIGN (TREE_TYPE (exp
));
4517 align
= CONSTANT_ALIGNMENT (exp
, align
);
4519 else if (SYMBOL_REF_DECL (sym
))
4520 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
4521 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
4522 && SYMBOL_REF_BLOCK (sym
) != NULL
)
4523 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
4525 align
= BITS_PER_UNIT
;
4527 ref_size
= GET_MODE_SIZE (mode
);
4529 ref_size
= GET_MODE_SIZE (DImode
);
4531 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
4532 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
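/* Worked example of the base + immediate checks above, for a DImode
   (8-byte) access:

     offset 32760  -> valid: 32760 / 8 == 4095, so it fits the unsigned
                      scaled 12-bit form (ldr x0, [x1, #32760]).
     offset -256   -> valid: fits the signed unscaled 9-bit form
                      (ldur x0, [x1, #-256]).
     offset 32768  -> not valid: neither form can encode it, so the
                      address has to be legitimized into base + residue.

   The pairwise LDP/STP path instead uses the signed scaled 7-bit
   range, as the comments above describe.  */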
/* Return true if X is a symbolic constant: a SYMBOL_REF or LABEL_REF,
   possibly with a constant offset.  */

bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, offset);
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
}

/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
   pair operation.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x,
                              RTX_CODE outer_code, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
}
/* Split an out-of-range address displacement into a base and offset.
   Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
   to increase opportunities for sharing the base address of different sizes.
   For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
static bool
aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
{
  HOST_WIDE_INT offset = INTVAL (*disp);
  HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);

  if (mode == TImode || mode == TFmode
      || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
    base = (offset + 0x100) & ~0x1ff;

  *off = GEN_INT (base);
  *disp = GEN_INT (offset - base);
  return true;
}
/* Return TRUE if rtx X is immediate constant 0.0  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
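/* Note on the clobbers added above: X16 (IP0) and X17 (IP1) are recorded
   as clobbered because a branch through the PLT or a linker-generated
   long-branch veneer is free to use them, so their contents cannot be
   assumed to survive any call.  (Descriptive comment only; the behaviour
   comes from the AAPCS64 and linker conventions, not from this
   function.)  */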
4639 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
4641 /* All floating point compares return CCFP if it is an equality
4642 comparison, and CCFPE otherwise. */
4643 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
4670 /* Equality comparisons of short modes against zero can be performed
4671 using the TST instruction with the appropriate bitmask. */
4672 if (y
== const0_rtx
&& REG_P (x
)
4673 && (code
== EQ
|| code
== NE
)
4674 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
4677 /* Similarly, comparisons of zero_extends from shorter modes can
4678 be performed using an ANDS with an immediate mask. */
4679 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
4680 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4681 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
4682 && (code
== EQ
|| code
== NE
))
4685 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4687 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
4688 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
4689 || GET_CODE (x
) == NEG
4690 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
4691 && CONST_INT_P (XEXP (x
, 2)))))
4694 /* A compare with a shifted operand. Because of canonicalization,
4695 the comparison will have to be swapped when we emit the assembly
4697 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4698 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4699 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
4700 || GET_CODE (x
) == LSHIFTRT
4701 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
4704 /* Similarly for a negated operand, but we can only do this for
4706 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4707 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4708 && (code
== EQ
|| code
== NE
)
4709 && GET_CODE (x
) == NEG
)
4712 /* A test for unsigned overflow. */
4713 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
4715 && GET_CODE (x
) == PLUS
4716 && GET_CODE (y
) == ZERO_EXTEND
)
4719 /* For everything else, return CCmode. */
4724 aarch64_get_condition_code_1 (enum machine_mode
, enum rtx_code
);
4727 aarch64_get_condition_code (rtx x
)
4729 machine_mode mode
= GET_MODE (XEXP (x
, 0));
4730 enum rtx_code comp_code
= GET_CODE (x
);
4732 if (GET_MODE_CLASS (mode
) != MODE_CC
)
4733 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
4734 return aarch64_get_condition_code_1 (mode
, comp_code
);
4738 aarch64_get_condition_code_1 (enum machine_mode mode
, enum rtx_code comp_code
)
4746 case GE
: return AARCH64_GE
;
4747 case GT
: return AARCH64_GT
;
4748 case LE
: return AARCH64_LS
;
4749 case LT
: return AARCH64_MI
;
4750 case NE
: return AARCH64_NE
;
4751 case EQ
: return AARCH64_EQ
;
4752 case ORDERED
: return AARCH64_VC
;
4753 case UNORDERED
: return AARCH64_VS
;
4754 case UNLT
: return AARCH64_LT
;
4755 case UNLE
: return AARCH64_LE
;
4756 case UNGT
: return AARCH64_HI
;
4757 case UNGE
: return AARCH64_PL
;
4765 case NE
: return AARCH64_NE
;
4766 case EQ
: return AARCH64_EQ
;
4767 case GE
: return AARCH64_GE
;
4768 case GT
: return AARCH64_GT
;
4769 case LE
: return AARCH64_LE
;
4770 case LT
: return AARCH64_LT
;
4771 case GEU
: return AARCH64_CS
;
4772 case GTU
: return AARCH64_HI
;
4773 case LEU
: return AARCH64_LS
;
4774 case LTU
: return AARCH64_CC
;
4782 case NE
: return AARCH64_NE
;
4783 case EQ
: return AARCH64_EQ
;
4784 case GE
: return AARCH64_LE
;
4785 case GT
: return AARCH64_LT
;
4786 case LE
: return AARCH64_GE
;
4787 case LT
: return AARCH64_GT
;
4788 case GEU
: return AARCH64_LS
;
4789 case GTU
: return AARCH64_CC
;
4790 case LEU
: return AARCH64_CS
;
4791 case LTU
: return AARCH64_HI
;
4799 case NE
: return AARCH64_NE
;
4800 case EQ
: return AARCH64_EQ
;
4801 case GE
: return AARCH64_PL
;
4802 case LT
: return AARCH64_MI
;
4810 case NE
: return AARCH64_NE
;
4811 case EQ
: return AARCH64_EQ
;
4819 case NE
: return AARCH64_CS
;
4820 case EQ
: return AARCH64_CC
;
/* Return true if X is a CONST_VECTOR of integer mode whose elements are
   all equal and lie in the range [MINVAL, MAXVAL].  */
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  HOST_WIDE_INT firstval;
  int count, i;

  if (GET_CODE (x) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
    return false;

  firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
  if (firstval < minval || firstval > maxval)
    return false;

  count = CONST_VECTOR_NUNITS (x);
  for (i = 1; i < count; i++)
    if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
      return false;

  return true;
}

/* Return true if X is a CONST_VECTOR with all elements equal to VAL.  */
static bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,             /* EQ, Z == 1.  */
  AARCH64_CC_Z,  /* NE, Z == 0.  */
  0,             /* CS, C == 1.  */
  AARCH64_CC_C,  /* CC, C == 0.  */
  0,             /* MI, N == 1.  */
  AARCH64_CC_N,  /* PL, N == 0.  */
  0,             /* VS, V == 1.  */
  AARCH64_CC_V,  /* VC, V == 0.  */
  0,             /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,  /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,  /* GE, N == V.  */
  0,             /* LT, N != V.  */
  AARCH64_CC_Z,  /* GT, Z == 0 && N == V.  */
  0,             /* LE, !(Z == 0 && N == V).  */
  0,             /* AL, any.  */
  0              /* NV, any.  */
};
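/* Worked example of how the table is used: when a conditional compare
   must make a following GE test fail, the GE entry above (AARCH64_CC_V,
   i.e. nzcv == 0b0001) is printed as the CCMP immediate, e.g.

     ccmp  x0, #4, #1, ne

   If the NE condition does not hold, the flags are set to N=0 Z=0 C=0
   V=1, so N != V and GE is false.  (Illustrative encoding only, not an
   instruction emitted verbatim by this file.)  */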
4891 aarch64_print_operand (FILE *f
, rtx x
, int code
)
4895 /* An integer or symbol address without a preceding # sign. */
4897 switch (GET_CODE (x
))
4900 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
4904 output_addr_const (f
, x
);
4908 if (GET_CODE (XEXP (x
, 0)) == PLUS
4909 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
4911 output_addr_const (f
, x
);
4917 output_operand_lossage ("Unsupported operand for code '%c'", code
);
4922 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4926 if (!CONST_INT_P (x
)
4927 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
4929 output_operand_lossage ("invalid operand for '%%%c'", code
);
4945 output_operand_lossage ("invalid operand for '%%%c'", code
);
4955 /* Print N such that 2^N == X. */
4956 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
4958 output_operand_lossage ("invalid operand for '%%%c'", code
);
4962 asm_fprintf (f
, "%d", n
);
4967 /* Print the number of non-zero bits in X (a const_int). */
4968 if (!CONST_INT_P (x
))
4970 output_operand_lossage ("invalid operand for '%%%c'", code
);
4974 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
4978 /* Print the higher numbered register of a pair (TImode) of regs. */
4979 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
4981 output_operand_lossage ("invalid operand for '%%%c'", code
);
4985 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
4992 /* Print a condition (eq, ne, etc) or its inverse. */
4994 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4995 if (x
== const_true_rtx
)
5002 if (!COMPARISON_P (x
))
5004 output_operand_lossage ("invalid operand for '%%%c'", code
);
5008 cond_code
= aarch64_get_condition_code (x
);
5009 gcc_assert (cond_code
>= 0);
5011 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
5012 fputs (aarch64_condition_codes
[cond_code
], f
);
5021 /* Print a scalar FP/SIMD register name. */
5022 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5024 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5027 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
5034 /* Print the first FP/SIMD register name in a list. */
5035 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5037 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5040 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
5044 /* Print a scalar FP/SIMD register name + 1. */
5045 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5047 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5050 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
5054 /* Print bottom 16 bits of integer constant in hex. */
5055 if (!CONST_INT_P (x
))
5057 output_operand_lossage ("invalid operand for '%%%c'", code
);
5060 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
5065 /* Print a general register name or the zero register (32-bit or
5068 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
5070 asm_fprintf (f
, "%czr", code
);
5074 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
5076 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
5080 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
5082 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
5089 /* Print a normal operand, if it's a general register, then we
5093 output_operand_lossage ("missing operand");
5097 switch (GET_CODE (x
))
5100 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
5104 output_address (GET_MODE (x
), XEXP (x
, 0));
5110 output_addr_const (asm_out_file
, x
);
5114 asm_fprintf (f
, "%wd", INTVAL (x
));
5118 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
5121 aarch64_const_vec_all_same_in_range_p (x
,
5123 HOST_WIDE_INT_MAX
));
5124 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
5126 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
5135 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5136 be getting CONST_DOUBLEs holding integers. */
5137 gcc_assert (GET_MODE (x
) != VOIDmode
);
5138 if (aarch64_float_const_zero_rtx_p (x
))
5143 else if (aarch64_float_const_representable_p (x
))
5146 char float_buf
[buf_size
] = {'\0'};
5147 real_to_decimal_for_mode (float_buf
,
5148 CONST_DOUBLE_REAL_VALUE (x
),
5151 asm_fprintf (asm_out_file
, "%s", float_buf
);
5155 output_operand_lossage ("invalid constant");
5158 output_operand_lossage ("invalid operand");
5164 if (GET_CODE (x
) == HIGH
)
5167 switch (aarch64_classify_symbolic_expression (x
))
5169 case SYMBOL_SMALL_GOT_4G
:
5170 asm_fprintf (asm_out_file
, ":got:");
5173 case SYMBOL_SMALL_TLSGD
:
5174 asm_fprintf (asm_out_file
, ":tlsgd:");
5177 case SYMBOL_SMALL_TLSDESC
:
5178 asm_fprintf (asm_out_file
, ":tlsdesc:");
5181 case SYMBOL_SMALL_TLSIE
:
5182 asm_fprintf (asm_out_file
, ":gottprel:");
5185 case SYMBOL_TLSLE24
:
5186 asm_fprintf (asm_out_file
, ":tprel:");
5189 case SYMBOL_TINY_GOT
:
5196 output_addr_const (asm_out_file
, x
);
5200 switch (aarch64_classify_symbolic_expression (x
))
5202 case SYMBOL_SMALL_GOT_4G
:
5203 asm_fprintf (asm_out_file
, ":lo12:");
5206 case SYMBOL_SMALL_TLSGD
:
5207 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
5210 case SYMBOL_SMALL_TLSDESC
:
5211 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
5214 case SYMBOL_SMALL_TLSIE
:
5215 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
5218 case SYMBOL_TLSLE12
:
5219 asm_fprintf (asm_out_file
, ":tprel_lo12:");
5222 case SYMBOL_TLSLE24
:
5223 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
5226 case SYMBOL_TINY_GOT
:
5227 asm_fprintf (asm_out_file
, ":got:");
5230 case SYMBOL_TINY_TLSIE
:
5231 asm_fprintf (asm_out_file
, ":gottprel:");
5237 output_addr_const (asm_out_file
, x
);
5242 switch (aarch64_classify_symbolic_expression (x
))
5244 case SYMBOL_TLSLE24
:
5245 asm_fprintf (asm_out_file
, ":tprel_hi12:");
5250 output_addr_const (asm_out_file
, x
);
5255 HOST_WIDE_INT cond_code
;
5258 if (!CONST_INT_P (x
))
5260 output_operand_lossage ("invalid operand for '%%%c'", code
);
5264 cond_code
= INTVAL (x
);
5265 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
5266 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
5271 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
5277 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
5279 struct aarch64_address_info addr
;
5281 if (aarch64_classify_address (&addr
, x
, mode
, MEM
, true))
5284 case ADDRESS_REG_IMM
:
5285 if (addr
.offset
== const0_rtx
)
5286 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
5288 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
5289 INTVAL (addr
.offset
));
5292 case ADDRESS_REG_REG
:
5293 if (addr
.shift
== 0)
5294 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
5295 reg_names
[REGNO (addr
.offset
)]);
5297 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
5298 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
5301 case ADDRESS_REG_UXTW
:
5302 if (addr
.shift
== 0)
5303 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
5304 REGNO (addr
.offset
) - R0_REGNUM
);
5306 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
5307 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5310 case ADDRESS_REG_SXTW
:
5311 if (addr
.shift
== 0)
5312 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
5313 REGNO (addr
.offset
) - R0_REGNUM
);
5315 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
5316 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5319 case ADDRESS_REG_WB
:
5320 switch (GET_CODE (x
))
5323 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
5324 GET_MODE_SIZE (mode
));
5327 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
5328 GET_MODE_SIZE (mode
));
5331 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
5332 GET_MODE_SIZE (mode
));
5335 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
5336 GET_MODE_SIZE (mode
));
5339 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
5340 INTVAL (addr
.offset
));
5343 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
5344 INTVAL (addr
.offset
));
5351 case ADDRESS_LO_SUM
:
5352 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
5353 output_addr_const (f
, addr
.offset
);
5354 asm_fprintf (f
, "]");
5357 case ADDRESS_SYMBOLIC
:
5361 output_addr_const (f
, x
);
5365 aarch64_label_mentioned_p (rtx x
)
5370 if (GET_CODE (x
) == LABEL_REF
)
5373 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5374 referencing instruction, but they are constant offsets, not
5376 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5379 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
5380 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
5386 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
5387 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
5390 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
5397 /* Implement REGNO_REG_CLASS. */
5400 aarch64_regno_regclass (unsigned regno
)
5402 if (GP_REGNUM_P (regno
))
5403 return GENERAL_REGS
;
5405 if (regno
== SP_REGNUM
)
5408 if (regno
== FRAME_POINTER_REGNUM
5409 || regno
== ARG_POINTER_REGNUM
)
5410 return POINTER_REGS
;
5412 if (FP_REGNUM_P (regno
))
5413 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
5419 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
5421 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5422 where mask is selected by alignment and size of the offset.
5423 We try to pick as large a range for the offset as possible to
5424 maximize the chance of a CSE. However, for aligned addresses
5425 we limit the range to 4k so that structures with different sized
5426 elements are likely to use the same base. We need to be careful
5427 not to split a CONST for some forms of address expression, otherwise
5428 it will generate sub-optimal code. */
5430 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
5432 rtx base
= XEXP (x
, 0);
5433 rtx offset_rtx
= XEXP (x
, 1);
5434 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
5436 if (GET_CODE (base
) == PLUS
)
5438 rtx op0
= XEXP (base
, 0);
5439 rtx op1
= XEXP (base
, 1);
5441 /* Force any scaling into a temp for CSE. */
5442 op0
= force_reg (Pmode
, op0
);
5443 op1
= force_reg (Pmode
, op1
);
5445 /* Let the pointer register be in op0. */
5446 if (REG_POINTER (op1
))
5447 std::swap (op0
, op1
);
5449 /* If the pointer is virtual or frame related, then we know that
5450 virtual register instantiation or register elimination is going
5451 to apply a second constant. We want the two constants folded
5452 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5453 if (virt_or_elim_regno_p (REGNO (op0
)))
5455 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
5456 NULL_RTX
, true, OPTAB_DIRECT
);
5457 return gen_rtx_PLUS (Pmode
, base
, op1
);
5460 /* Otherwise, in order to encourage CSE (and thence loop strength
5461 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5462 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
5463 NULL_RTX
, true, OPTAB_DIRECT
);
5464 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
5467 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5468 HOST_WIDE_INT base_offset
;
5469 if (GET_MODE_SIZE (mode
) > 16)
5470 base_offset
= (offset
+ 0x400) & ~0x7f0;
5471 /* For offsets aren't a multiple of the access size, the limit is
5473 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
5475 base_offset
= (offset
+ 0x100) & ~0x1ff;
5477 /* BLKmode typically uses LDP of X-registers. */
5478 if (mode
== BLKmode
)
5479 base_offset
= (offset
+ 512) & ~0x3ff;
5481 /* Small negative offsets are supported. */
5482 else if (IN_RANGE (offset
, -256, 0))
5484 else if (mode
== TImode
|| mode
== TFmode
)
5485 base_offset
= (offset
+ 0x100) & ~0x1ff;
5486 /* Use 12-bit offset by access size. */
5488 base_offset
= offset
& (~0xfff * GET_MODE_SIZE (mode
));
5490 if (base_offset
!= 0)
5492 base
= plus_constant (Pmode
, base
, base_offset
);
5493 base
= force_operand (base
, NULL_RTX
);
5494 return plus_constant (Pmode
, base
, offset
- base_offset
);
5501 /* Return the reload icode required for a constant pool in mode. */
5502 static enum insn_code
5503 aarch64_constant_pool_reload_icode (machine_mode mode
)
5508 return CODE_FOR_aarch64_reload_movcpsfdi
;
5511 return CODE_FOR_aarch64_reload_movcpdfdi
;
5514 return CODE_FOR_aarch64_reload_movcptfdi
;
5517 return CODE_FOR_aarch64_reload_movcpv8qidi
;
5520 return CODE_FOR_aarch64_reload_movcpv16qidi
;
5523 return CODE_FOR_aarch64_reload_movcpv4hidi
;
5526 return CODE_FOR_aarch64_reload_movcpv8hidi
;
5529 return CODE_FOR_aarch64_reload_movcpv2sidi
;
5532 return CODE_FOR_aarch64_reload_movcpv4sidi
;
5535 return CODE_FOR_aarch64_reload_movcpv2didi
;
5538 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
5547 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
5550 secondary_reload_info
*sri
)
5553 /* If we have to disable direct literal pool loads and stores because the
5554 function is too big, then we need a scratch register. */
5555 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
5556 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
5557 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
5558 && !aarch64_pcrelative_literal_loads
)
5560 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
5564 /* Without the TARGET_SIMD instructions we cannot move a Q register
5565 to a Q register directly. We need a scratch. */
5566 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
5567 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
5568 && reg_class_subset_p (rclass
, FP_REGS
))
5571 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
5572 else if (mode
== TImode
)
5573 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
5577 /* A TFmode or TImode memory access should be handled via an FP_REGS
5578 because AArch64 has richer addressing modes for LDR/STR instructions
5579 than LDP/STP instructions. */
5580 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
5581 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
5584 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
5585 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from, const int to)
{
  /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
     HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */

  if (frame_pointer_needed)
    {
      if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
        return true;
      if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
        return false;
      if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
          && !cfun->calls_alloca)
        return true;
      if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
        return true;

      return false;
    }
  else
    {
      /* If we decided that we didn't need a leaf frame pointer but then used
         LR in the function, then we'll want a frame pointer after all, so
         prevent this elimination to ensure a frame pointer is used.  */
      if (to == STACK_POINTER_REGNUM
          && flag_omit_leaf_frame_pointer
          && df_regs_ever_live_p (LR_REGNUM))
        return false;
    }

  return true;
}
HOST_WIDE_INT
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset
               - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.frame_size
               - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
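/* Illustrative example (assumed layout, for exposition only): with
   frame_size == 64, locals_offset == 16 and hard_fp_offset == 48, the
   offsets returned above are

     ARG_POINTER   -> HARD_FRAME_POINTER : 48
     FRAME_POINTER -> HARD_FRAME_POINTER : 48 - 16 = 32
     FRAME_POINTER -> STACK_POINTER      : 64 - 16 = 48
     ARG_POINTER   -> STACK_POINTER      : 64

   i.e. the soft frame/arg pointers are simply the eliminated-to hard
   register plus a constant determined by aarch64_layout_frame.  */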
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}
static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
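/* Sketch of the resulting trampoline (LP64 case; the register names
   depend on IP1_REGNUM and STATIC_CHAIN_REGNUM, so x17/x18 below are
   only illustrative):

       ldr   x17, .+16        // load the target function address
       ldr   x18, .+20        // load the static chain value
       br    x17
       .word  0               // padding emitted above
       .dword 0               // function address, filled in by
       .dword 0               // static chain,     aarch64_trampoline_init  */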
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
                   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
                     LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
                     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
                     ptr_mode);
}
5707 static unsigned char
5708 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
5712 case CALLER_SAVE_REGS
:
5719 aarch64_vector_mode_p (mode
)
5720 ? (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
5721 : (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
5735 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
5737 if (regclass
== POINTER_REGS
)
5738 return GENERAL_REGS
;
5740 if (regclass
== STACK_REG
)
5743 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
5749 /* If it's an integer immediate that MOVI can't handle, then
5750 FP_REGS is not an option, so we return NO_REGS instead. */
5751 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
5752 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
5755 /* Register eliminiation can result in a request for
5756 SP+constant->FP_REGS. We cannot support such operations which
5757 use SP as source and an FP_REG as destination, so reject out
5759 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
5761 rtx lhs
= XEXP (x
, 0);
5763 /* Look through a possible SUBREG introduced by ILP32. */
5764 if (GET_CODE (lhs
) == SUBREG
)
5765 lhs
= SUBREG_REG (lhs
);
5767 gcc_assert (REG_P (lhs
));
5768 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
         would be enough, the compiler might not know that.  To avoid
         -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
         would be enough, the compiler might not know that.  To avoid
         -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
5823 aarch64_output_casesi (rtx
*operands
)
5827 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5829 static const char *const patterns
[4][2] =
5832 "ldrb\t%w3, [%0,%w1,uxtw]",
5833 "add\t%3, %4, %w3, sxtb #2"
5836 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5837 "add\t%3, %4, %w3, sxth #2"
5840 "ldr\t%w3, [%0,%w1,uxtw #2]",
5841 "add\t%3, %4, %w3, sxtw #2"
5843 /* We assume that DImode is only generated when not optimizing and
5844 that we don't really need 64-bit address offsets. That would
5845 imply an object file with 8GB of code in a single function! */
5847 "ldr\t%w3, [%0,%w1,uxtw #2]",
5848 "add\t%3, %4, %w3, sxtw #2"
5852 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
5854 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
5856 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
5859 output_asm_insn (patterns
[index
][0], operands
);
5860 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
5861 snprintf (buf
, sizeof (buf
),
5862 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
5863 output_asm_insn (buf
, operands
);
5864 output_asm_insn (patterns
[index
][1], operands
);
5865 output_asm_insn ("br\t%3", operands
);
5866 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */
int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
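/* Worked example: aarch64_uxt_size (2, 0x3fc) returns 8, because
   0x3fc == 0xff << 2, i.e. the operand is a byte value scaled by 4 and
   can be written as "add x0, x1, w2, uxtb #2".  A mask that is not a
   shifted 0xff/0xffff/0xffffffff pattern yields 0.  */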
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* Fixme:: In an ideal world this would work similar
     to the logic in aarch64_select_rtx_section but this
     breaks bootstrap in gcc go.  For now we workaround
     this by returning false here.  */
  return false;
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
                            rtx x,
                            unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
                                  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
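/* For example, both (ashift:DI (reg:DI x2) (const_int 3)) and the
   canonical (mult:DI (reg:DI x2) (const_int 8)) are stripped to
   (reg:DI x2) here, since either form can be emitted as the shifted
   operand of an ALU instruction such as "add x0, x1, x2, lsl #3".  */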
5966 /* Helper function for rtx cost calculation. Strip an extend
5967 expression from X. Returns the inner operand if successful, or the
5968 original expression on failure. We deal with a number of possible
5969 canonicalization variations here. */
5971 aarch64_strip_extend (rtx x
)
5975 /* Zero and sign extraction of a widened value. */
5976 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
5977 && XEXP (op
, 2) == const0_rtx
5978 && GET_CODE (XEXP (op
, 0)) == MULT
5979 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
5981 return XEXP (XEXP (op
, 0), 0);
5983 /* It can also be represented (for zero-extend) as an AND with an
5985 if (GET_CODE (op
) == AND
5986 && GET_CODE (XEXP (op
, 0)) == MULT
5987 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
5988 && CONST_INT_P (XEXP (op
, 1))
5989 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
5990 INTVAL (XEXP (op
, 1))) != 0)
5991 return XEXP (XEXP (op
, 0), 0);
5993 /* Now handle extended register, as this may also have an optional
5994 left shift by 1..4. */
5995 if (GET_CODE (op
) == ASHIFT
5996 && CONST_INT_P (XEXP (op
, 1))
5997 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
6000 if (GET_CODE (op
) == ZERO_EXTEND
6001 || GET_CODE (op
) == SIGN_EXTEND
)
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */
static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}
6019 /* Helper function for rtx cost calculation. Calculate the cost of
6020 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6021 Return the calculated cost of the expression, recursing manually in to
6022 operands where needed. */
6025 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
6028 const struct cpu_cost_table
*extra_cost
6029 = aarch64_tune_params
.insn_extra_cost
;
6031 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
6032 machine_mode mode
= GET_MODE (x
);
6034 gcc_checking_assert (code
== MULT
);
6039 if (VECTOR_MODE_P (mode
))
6040 mode
= GET_MODE_INNER (mode
);
6042 /* Integer multiply/fma. */
6043 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6045 /* The multiply will be canonicalized as a shift, cost it as such. */
6046 if (aarch64_shift_p (GET_CODE (x
))
6047 || (CONST_INT_P (op1
)
6048 && exact_log2 (INTVAL (op1
)) > 0))
6050 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
6051 || GET_CODE (op0
) == SIGN_EXTEND
;
6057 /* ARITH + shift-by-register. */
6058 cost
+= extra_cost
->alu
.arith_shift_reg
;
6060 /* ARITH + extended register. We don't have a cost field
6061 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6062 cost
+= extra_cost
->alu
.extend_arith
;
6064 /* ARITH + shift-by-immediate. */
6065 cost
+= extra_cost
->alu
.arith_shift
;
6068 /* LSL (immediate). */
6069 cost
+= extra_cost
->alu
.shift
;
6072 /* Strip extends as we will have costed them in the case above. */
6074 op0
= aarch64_strip_extend (op0
);
6076 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
6081 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6082 compound and let the below cases handle it. After all, MNEG is a
6083 special-case alias of MSUB. */
6084 if (GET_CODE (op0
) == NEG
)
6086 op0
= XEXP (op0
, 0);
6090 /* Integer multiplies or FMAs have zero/sign extending variants. */
6091 if ((GET_CODE (op0
) == ZERO_EXTEND
6092 && GET_CODE (op1
) == ZERO_EXTEND
)
6093 || (GET_CODE (op0
) == SIGN_EXTEND
6094 && GET_CODE (op1
) == SIGN_EXTEND
))
6096 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
6097 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
6102 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6103 cost
+= extra_cost
->mult
[0].extend_add
;
6105 /* MUL/SMULL/UMULL. */
6106 cost
+= extra_cost
->mult
[0].extend
;
6112 /* This is either an integer multiply or a MADD. In both cases
6113 we want to recurse and cost the operands. */
6114 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6115 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6121 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
6124 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
6133 /* Floating-point FMA/FMUL can also support negations of the
6134 operands, unless the rounding mode is upward or downward in
6135 which case FNMUL is different than FMUL with operand negation. */
6136 bool neg0
= GET_CODE (op0
) == NEG
;
6137 bool neg1
= GET_CODE (op1
) == NEG
;
6138 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
6141 op0
= XEXP (op0
, 0);
6143 op1
= XEXP (op1
, 0);
6147 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6148 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6151 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
6154 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6155 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6161 aarch64_address_cost (rtx x
,
6163 addr_space_t as ATTRIBUTE_UNUSED
,
6166 enum rtx_code c
= GET_CODE (x
);
6167 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
6168 struct aarch64_address_info info
;
6172 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
6174 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
6176 /* This is a CONST or SYMBOL ref which will be split
6177 in a different way depending on the code model in use.
6178 Cost it through the generic infrastructure. */
6179 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
6180 /* Divide through by the cost of one instruction to
6181 bring it to the same units as the address costs. */
6182 cost_symbol_ref
/= COSTS_N_INSNS (1);
6183 /* The cost is then the cost of preparing the address,
6184 followed by an immediate (possibly 0) offset. */
6185 return cost_symbol_ref
+ addr_cost
->imm_offset
;
6189 /* This is most likely a jump table from a case
6191 return addr_cost
->register_offset
;
6197 case ADDRESS_LO_SUM
:
6198 case ADDRESS_SYMBOLIC
:
6199 case ADDRESS_REG_IMM
:
6200 cost
+= addr_cost
->imm_offset
;
6203 case ADDRESS_REG_WB
:
6204 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
6205 cost
+= addr_cost
->pre_modify
;
6206 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
6207 cost
+= addr_cost
->post_modify
;
6213 case ADDRESS_REG_REG
:
6214 cost
+= addr_cost
->register_offset
;
6217 case ADDRESS_REG_SXTW
:
6218 cost
+= addr_cost
->register_sextend
;
6221 case ADDRESS_REG_UXTW
:
6222 cost
+= addr_cost
->register_zextend
;
6232 /* For the sake of calculating the cost of the shifted register
6233 component, we can treat same sized modes in the same way. */
6234 switch (GET_MODE_BITSIZE (mode
))
6237 cost
+= addr_cost
->addr_scale_costs
.hi
;
6241 cost
+= addr_cost
->addr_scale_costs
.si
;
6245 cost
+= addr_cost
->addr_scale_costs
.di
;
6248 /* We can't tell, or this is a 128-bit vector. */
6250 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */
static int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
6275 /* Return true if the RTX X in mode MODE is a zero or sign extract
6276 usable in an ADD or SUB (extended register) instruction. */
6278 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
6280 /* Catch add with a sign extract.
6281 This is add_<optab><mode>_multp2. */
6282 if (GET_CODE (x
) == SIGN_EXTRACT
6283 || GET_CODE (x
) == ZERO_EXTRACT
)
6285 rtx op0
= XEXP (x
, 0);
6286 rtx op1
= XEXP (x
, 1);
6287 rtx op2
= XEXP (x
, 2);
6289 if (GET_CODE (op0
) == MULT
6290 && CONST_INT_P (op1
)
6291 && op2
== const0_rtx
6292 && CONST_INT_P (XEXP (op0
, 1))
6293 && aarch64_is_extend_from_extract (mode
,
6300 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6302 else if (GET_CODE (x
) == SIGN_EXTEND
6303 || GET_CODE (x
) == ZERO_EXTEND
)
6304 return REG_P (XEXP (x
, 0));
6310 aarch64_frint_unspec_p (unsigned int u
)
6328 /* Return true iff X is an rtx that will match an extr instruction
6329 i.e. as described in the *extr<mode>5_insn family of patterns.
6330 OP0 and OP1 will be set to the operands of the shifts involved
6331 on success and will be NULL_RTX otherwise. */
6334 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
6337 machine_mode mode
= GET_MODE (x
);
6339 *res_op0
= NULL_RTX
;
6340 *res_op1
= NULL_RTX
;
6342 if (GET_CODE (x
) != IOR
)
6348 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
6349 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
6351 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6352 if (GET_CODE (op1
) == ASHIFT
)
6353 std::swap (op0
, op1
);
6355 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
6358 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
6359 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
6361 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
6362 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
6364 *res_op0
= XEXP (op0
, 0);
6365 *res_op1
= XEXP (op1
, 0);
6373 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6374 storing it in *COST. Result is true if the total cost of the operation
6375 has now been calculated. */
6377 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
6381 enum rtx_code cmpcode
;
6383 if (COMPARISON_P (op0
))
6385 inner
= XEXP (op0
, 0);
6386 comparator
= XEXP (op0
, 1);
6387 cmpcode
= GET_CODE (op0
);
6392 comparator
= const0_rtx
;
6396 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
6398 /* Conditional branch. */
6399 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6403 if (cmpcode
== NE
|| cmpcode
== EQ
)
6405 if (comparator
== const0_rtx
)
6407 /* TBZ/TBNZ/CBZ/CBNZ. */
6408 if (GET_CODE (inner
) == ZERO_EXTRACT
)
6410 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
6411 ZERO_EXTRACT
, 0, speed
);
6414 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
6419 else if (cmpcode
== LT
|| cmpcode
== GE
)
6422 if (comparator
== const0_rtx
)
6427 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6430 if (GET_CODE (op1
) == COMPARE
)
6432 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6433 if (XEXP (op1
, 1) == const0_rtx
)
6437 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
6438 const struct cpu_cost_table
*extra_cost
6439 = aarch64_tune_params
.insn_extra_cost
;
6441 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6442 *cost
+= extra_cost
->alu
.arith
;
6444 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
6449 /* It's a conditional operation based on the status flags,
6450 so it must be some flavor of CSEL. */
6452 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6453 if (GET_CODE (op1
) == NEG
6454 || GET_CODE (op1
) == NOT
6455 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
6456 op1
= XEXP (op1
, 0);
6457 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
6459 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6460 op1
= XEXP (op1
, 0);
6461 op2
= XEXP (op2
, 0);
6464 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
6465 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
6469 /* We don't know what this is, cost all operands. */
6473 /* Check whether X is a bitfield operation of the form shift + extend that
6474 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6475 operand to which the bitfield operation is applied. Otherwise return
6479 aarch64_extend_bitfield_pattern_p (rtx x
)
6481 rtx_code outer_code
= GET_CODE (x
);
6482 machine_mode outer_mode
= GET_MODE (x
);
6484 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
6485 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
6488 rtx inner
= XEXP (x
, 0);
6489 rtx_code inner_code
= GET_CODE (inner
);
6490 machine_mode inner_mode
= GET_MODE (inner
);
6496 if (CONST_INT_P (XEXP (inner
, 1))
6497 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6498 op
= XEXP (inner
, 0);
6501 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6502 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6503 op
= XEXP (inner
, 0);
6506 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6507 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6508 op
= XEXP (inner
, 0);
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
         && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
         && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
         && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
6530 /* Calculate the cost of calculating X, storing it in *COST. Result
6531 is true if the total cost of the operation has now been calculated. */
6533 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
6534 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
6537 const struct cpu_cost_table
*extra_cost
6538 = aarch64_tune_params
.insn_extra_cost
;
6539 int code
= GET_CODE (x
);
6541 /* By default, assume that everything has equivalent cost to the
6542 cheapest instruction. Any additional costs are applied as a delta
6543 above this default. */
6544 *cost
= COSTS_N_INSNS (1);
6549 /* The cost depends entirely on the operands to SET. */
6554 switch (GET_CODE (op0
))
6559 rtx address
= XEXP (op0
, 0);
6560 if (VECTOR_MODE_P (mode
))
6561 *cost
+= extra_cost
->ldst
.storev
;
6562 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6563 *cost
+= extra_cost
->ldst
.store
;
6564 else if (mode
== SFmode
)
6565 *cost
+= extra_cost
->ldst
.storef
;
6566 else if (mode
== DFmode
)
6567 *cost
+= extra_cost
->ldst
.stored
;
6570 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6574 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6578 if (! REG_P (SUBREG_REG (op0
)))
6579 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
6583 /* The cost is one per vector-register copied. */
6584 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
6586 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6587 / GET_MODE_SIZE (V4SImode
);
6588 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6590 /* const0_rtx is in general free, but we will use an
6591 instruction to set a register to 0. */
6592 else if (REG_P (op1
) || op1
== const0_rtx
)
6594 /* The cost is 1 per register copied. */
6595 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6597 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6600 /* Cost is just the cost of the RHS of the set. */
6601 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6606 /* Bit-field insertion. Strip any redundant widening of
6607 the RHS to meet the width of the target. */
6608 if (GET_CODE (op1
) == SUBREG
)
6609 op1
= SUBREG_REG (op1
);
6610 if ((GET_CODE (op1
) == ZERO_EXTEND
6611 || GET_CODE (op1
) == SIGN_EXTEND
)
6612 && CONST_INT_P (XEXP (op0
, 1))
6613 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1
, 0)))
6614 >= INTVAL (XEXP (op0
, 1))))
6615 op1
= XEXP (op1
, 0);
6617 if (CONST_INT_P (op1
))
6619 /* MOV immediate is assumed to always be cheap. */
6620 *cost
= COSTS_N_INSNS (1);
6626 *cost
+= extra_cost
->alu
.bfi
;
6627 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
6633 /* We can't make sense of this, assume default cost. */
6634 *cost
= COSTS_N_INSNS (1);
6640 /* If an instruction can incorporate a constant within the
6641 instruction, the instruction's expression avoids calling
6642 rtx_cost() on the constant. If rtx_cost() is called on a
6643 constant, then it is usually because the constant must be
6644 moved into a register by one or more instructions.
6646 The exception is constant 0, which can be expressed
6647 as XZR/WZR and is therefore free. The exception to this is
6648 if we have (set (reg) (const0_rtx)) in which case we must cost
6649 the move. However, we can catch that when we cost the SET, so
6650 we don't need to consider that here. */
6651 if (x
== const0_rtx
)
6655 /* To an approximation, building any other constant is
6656 proportionally expensive to the number of instructions
6657 required to build that constant. This is true whether we
6658 are compiling for SPEED or otherwise. */
6659 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
6660 (NULL_RTX
, x
, false, mode
));
6667 /* mov[df,sf]_aarch64. */
6668 if (aarch64_float_const_representable_p (x
))
6669 /* FMOV (scalar immediate). */
6670 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
6671 else if (!aarch64_float_const_zero_rtx_p (x
))
6673 /* This will be a load from memory. */
6675 *cost
+= extra_cost
->ldst
.loadd
;
6677 *cost
+= extra_cost
->ldst
.loadf
;
6680 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6681 or MOV v0.s[0], wzr - neither of which are modeled by the
6682 cost tables. Just use the default cost. */
6692 /* For loads we want the base cost of a load, plus an
6693 approximation for the additional cost of the addressing
6695 rtx address
= XEXP (x
, 0);
6696 if (VECTOR_MODE_P (mode
))
6697 *cost
+= extra_cost
->ldst
.loadv
;
6698 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6699 *cost
+= extra_cost
->ldst
.load
;
6700 else if (mode
== SFmode
)
6701 *cost
+= extra_cost
->ldst
.loadf
;
6702 else if (mode
== DFmode
)
6703 *cost
+= extra_cost
->ldst
.loadd
;
6706 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6715 if (VECTOR_MODE_P (mode
))
6720 *cost
+= extra_cost
->vect
.alu
;
6725 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6727 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
6728 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
6731 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
6735 /* Cost this as SUB wzr, X. */
6736 op0
= CONST0_RTX (mode
);
6741 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6743 /* Support (neg(fma...)) as a single instruction only if
6744 sign of zeros is unimportant. This matches the decision
6745 making in aarch64.md. */
6746 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
6749 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
6752 if (GET_CODE (op0
) == MULT
)
6755 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
6760 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
6770 if (VECTOR_MODE_P (mode
))
6771 *cost
+= extra_cost
->vect
.alu
;
6773 *cost
+= extra_cost
->alu
.clz
;
6782 if (op1
== const0_rtx
6783 && GET_CODE (op0
) == AND
)
6786 mode
= GET_MODE (op0
);
6790 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
6792 /* TODO: A write to the CC flags possibly costs extra, this
6793 needs encoding in the cost tables. */
6795 mode
= GET_MODE (op0
);
6797 if (GET_CODE (op0
) == AND
)
6803 if (GET_CODE (op0
) == PLUS
)
6805 /* ADDS (and CMN alias). */
6810 if (GET_CODE (op0
) == MINUS
)
6817 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
6818 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
6819 && CONST_INT_P (XEXP (op0
, 2)))
6821 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6822 Handle it here directly rather than going to cost_logic
6823 since we know the immediate generated for the TST is valid
6824 so we can avoid creating an intermediate rtx for it only
6825 for costing purposes. */
6827 *cost
+= extra_cost
->alu
.logical
;
6829 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
6830 ZERO_EXTRACT
, 0, speed
);
6834 if (GET_CODE (op1
) == NEG
)
6838 *cost
+= extra_cost
->alu
.arith
;
6840 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
6841 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
6847 Compare can freely swap the order of operands, and
6848 canonicalization puts the more complex operation first.
6849 But the integer MINUS logic expects the shift/extend
6850 operation in op1. */
6852 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
6860 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
6864 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
6866 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
6868 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
6869 /* FCMP supports constant 0.0 for no extra cost. */
6875 if (VECTOR_MODE_P (mode
))
6877 /* Vector compare. */
6879 *cost
+= extra_cost
->vect
.alu
;
6881 if (aarch64_float_const_zero_rtx_p (op1
))
6883 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6897 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
6899 /* Detect valid immediates. */
6900 if ((GET_MODE_CLASS (mode
) == MODE_INT
6901 || (GET_MODE_CLASS (mode
) == MODE_CC
6902 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
6903 && CONST_INT_P (op1
)
6904 && aarch64_uimm12_shift (INTVAL (op1
)))
6907 /* SUB(S) (immediate). */
6908 *cost
+= extra_cost
->alu
.arith
;
6912 /* Look for SUB (extended register). */
6913 if (aarch64_rtx_arith_op_extract_p (op1
, mode
))
6916 *cost
+= extra_cost
->alu
.extend_arith
;
6918 op1
= aarch64_strip_extend (op1
);
6919 *cost
+= rtx_cost (op1
, VOIDmode
,
6920 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
6924 rtx new_op1
= aarch64_strip_extend (op1
);
6926 /* Cost this as an FMA-alike operation. */
6927 if ((GET_CODE (new_op1
) == MULT
6928 || aarch64_shift_p (GET_CODE (new_op1
)))
6931 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
6932 (enum rtx_code
) code
,
6937 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
6941 if (VECTOR_MODE_P (mode
))
6944 *cost
+= extra_cost
->vect
.alu
;
6946 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6949 *cost
+= extra_cost
->alu
.arith
;
6951 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6954 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
6968 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
6969 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
6972 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
6973 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
6977 if (GET_MODE_CLASS (mode
) == MODE_INT
6978 && CONST_INT_P (op1
)
6979 && aarch64_uimm12_shift (INTVAL (op1
)))
6981 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
6984 /* ADD (immediate). */
6985 *cost
+= extra_cost
->alu
.arith
;
6989 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
6991 /* Look for ADD (extended register). */
6992 if (aarch64_rtx_arith_op_extract_p (op0
, mode
))
6995 *cost
+= extra_cost
->alu
.extend_arith
;
6997 op0
= aarch64_strip_extend (op0
);
6998 *cost
+= rtx_cost (op0
, VOIDmode
,
6999 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
7003 /* Strip any extend, leave shifts behind as we will
7004 cost them through mult_cost. */
7005 new_op0
= aarch64_strip_extend (op0
);
7007 if (GET_CODE (new_op0
) == MULT
7008 || aarch64_shift_p (GET_CODE (new_op0
)))
7010 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
7015 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
7019 if (VECTOR_MODE_P (mode
))
7022 *cost
+= extra_cost
->vect
.alu
;
7024 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7027 *cost
+= extra_cost
->alu
.arith
;
7029 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7032 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7039 *cost
= COSTS_N_INSNS (1);
7043 if (VECTOR_MODE_P (mode
))
7044 *cost
+= extra_cost
->vect
.alu
;
7046 *cost
+= extra_cost
->alu
.rev
;
7051 if (aarch_rev16_p (x
))
7053 *cost
= COSTS_N_INSNS (1);
7057 if (VECTOR_MODE_P (mode
))
7058 *cost
+= extra_cost
->vect
.alu
;
7060 *cost
+= extra_cost
->alu
.rev
;
7065 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
7067 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
7068 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
7070 *cost
+= extra_cost
->alu
.shift
;
7081 if (VECTOR_MODE_P (mode
))
7084 *cost
+= extra_cost
->vect
.alu
;
7089 && GET_CODE (op0
) == MULT
7090 && CONST_INT_P (XEXP (op0
, 1))
7091 && CONST_INT_P (op1
)
7092 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
7095 /* This is a UBFM/SBFM. */
7096 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
7098 *cost
+= extra_cost
->alu
.bfx
;
7102 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7104 if (CONST_INT_P (op1
))
7106 /* We have a mask + shift version of a UBFIZ
7107 i.e. the *andim_ashift<mode>_bfiz pattern. */
7108 if (GET_CODE (op0
) == ASHIFT
7109 && aarch64_mask_and_shift_for_ubfiz_p (mode
, op1
,
7112 *cost
+= rtx_cost (XEXP (op0
, 0), mode
,
7113 (enum rtx_code
) code
, 0, speed
);
7115 *cost
+= extra_cost
->alu
.bfx
;
7119 else if (aarch64_bitmask_imm (INTVAL (op1
), mode
))
7121 /* We possibly get the immediate for free, this is not
7123 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7125 *cost
+= extra_cost
->alu
.logical
;
7134 /* Handle ORN, EON, or BIC. */
7135 if (GET_CODE (op0
) == NOT
)
7136 op0
= XEXP (op0
, 0);
7138 new_op0
= aarch64_strip_shift (op0
);
7140 /* If we had a shift on op0 then this is a logical-shift-
7141 by-register/immediate operation. Otherwise, this is just
7142 a logical operation. */
7147 /* Shift by immediate. */
7148 if (CONST_INT_P (XEXP (op0
, 1)))
7149 *cost
+= extra_cost
->alu
.log_shift
;
7151 *cost
+= extra_cost
->alu
.log_shift_reg
;
7154 *cost
+= extra_cost
->alu
.logical
;
7157 /* In both cases we want to cost both operands. */
7158 *cost
+= rtx_cost (new_op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7159 *cost
+= rtx_cost (op1
, mode
, (enum rtx_code
) code
, 1, speed
);
7168 op0
= aarch64_strip_shift (x
);
7170 if (VECTOR_MODE_P (mode
))
7173 *cost
+= extra_cost
->vect
.alu
;
7177 /* MVN-shifted-reg. */
7180 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7183 *cost
+= extra_cost
->alu
.log_shift
;
7187 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7188 Handle the second form here taking care that 'a' in the above can
7190 else if (GET_CODE (op0
) == XOR
)
7192 rtx newop0
= XEXP (op0
, 0);
7193 rtx newop1
= XEXP (op0
, 1);
7194 rtx op0_stripped
= aarch64_strip_shift (newop0
);
7196 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
7197 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
7201 if (op0_stripped
!= newop0
)
7202 *cost
+= extra_cost
->alu
.log_shift
;
7204 *cost
+= extra_cost
->alu
.logical
;
7211 *cost
+= extra_cost
->alu
.logical
;
7218 /* If a value is written in SI mode, then zero extended to DI
7219 mode, the operation will in general be free as a write to
7220 a 'w' register implicitly zeroes the upper bits of an 'x'
7221 register. However, if this is
7223 (set (reg) (zero_extend (reg)))
7225 we must cost the explicit register move. */
7227 && GET_MODE (op0
) == SImode
7230 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
7232 /* If OP_COST is non-zero, then the cost of the zero extend
7233 is effectively the cost of the inner operation. Otherwise
7234 we have a MOV instruction and we take the cost from the MOV
7235 itself. This is true independently of whether we are
7236 optimizing for space or time. */
7242 else if (MEM_P (op0
))
7244 /* All loads can zero extend to any size for free. */
7245 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
7249 op0
= aarch64_extend_bitfield_pattern_p (x
);
7252 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
7254 *cost
+= extra_cost
->alu
.bfx
;
7260 if (VECTOR_MODE_P (mode
))
7263 *cost
+= extra_cost
->vect
.alu
;
7267 /* We generate an AND instead of UXTB/UXTH. */
7268 *cost
+= extra_cost
->alu
.logical
;
7274 if (MEM_P (XEXP (x
, 0)))
7279 rtx address
= XEXP (XEXP (x
, 0), 0);
7280 *cost
+= extra_cost
->ldst
.load_sign_extend
;
7283 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
7289 op0
= aarch64_extend_bitfield_pattern_p (x
);
7292 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
7294 *cost
+= extra_cost
->alu
.bfx
;
7300 if (VECTOR_MODE_P (mode
))
7301 *cost
+= extra_cost
->vect
.alu
;
7303 *cost
+= extra_cost
->alu
.extend
;
7311 if (CONST_INT_P (op1
))
7315 if (VECTOR_MODE_P (mode
))
7317 /* Vector shift (immediate). */
7318 *cost
+= extra_cost
->vect
.alu
;
7322 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7324 *cost
+= extra_cost
->alu
.shift
;
7328 /* We can incorporate zero/sign extend for free. */
7329 if (GET_CODE (op0
) == ZERO_EXTEND
7330 || GET_CODE (op0
) == SIGN_EXTEND
)
7331 op0
= XEXP (op0
, 0);
7333 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
7340 if (VECTOR_MODE_P (mode
))
7342 /* Vector shift (register). */
7343 *cost
+= extra_cost
->vect
.alu
;
7348 *cost
+= extra_cost
->alu
.shift_reg
;
7351 return false; /* All arguments need to be in registers. */
7361 if (CONST_INT_P (op1
))
7363 /* ASR (immediate) and friends. */
7366 if (VECTOR_MODE_P (mode
))
7367 *cost
+= extra_cost
->vect
.alu
;
7369 *cost
+= extra_cost
->alu
.shift
;
7372 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7378 /* ASR (register) and friends. */
7381 if (VECTOR_MODE_P (mode
))
7382 *cost
+= extra_cost
->vect
.alu
;
7384 *cost
+= extra_cost
->alu
.shift_reg
;
7386 return false; /* All arguments need to be in registers. */
7391 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
7392 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
7396 *cost
+= extra_cost
->ldst
.load
;
7398 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
7399 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
7401 /* ADRP, followed by ADD. */
7402 *cost
+= COSTS_N_INSNS (1);
7404 *cost
+= 2 * extra_cost
->alu
.arith
;
7406 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
7407 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
7411 *cost
+= extra_cost
->alu
.arith
;
7416 /* One extra load instruction, after accessing the GOT. */
7417 *cost
+= COSTS_N_INSNS (1);
7419 *cost
+= extra_cost
->ldst
.load
;
7425 /* ADRP/ADD (immediate). */
7427 *cost
+= extra_cost
->alu
.arith
;
7435 if (VECTOR_MODE_P (mode
))
7436 *cost
+= extra_cost
->vect
.alu
;
7438 *cost
+= extra_cost
->alu
.bfx
;
7441 /* We can trust that the immediates used will be correct (there
7442 are no by-register forms), so we need only cost op0. */
7443 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
7447 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
7448 /* aarch64_rtx_mult_cost always handles recursion to its
7453 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7454 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7455 an unconditional negate. This case should only ever be reached through
7456 the set_smod_pow2_cheap check in expmed.c. */
7457 if (CONST_INT_P (XEXP (x
, 1))
7458 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
7459 && (mode
== SImode
|| mode
== DImode
))
7461 /* We expand to 4 instructions. Reset the baseline. */
7462 *cost
= COSTS_N_INSNS (4);
7465 *cost
+= 2 * extra_cost
->alu
.logical
7466 + 2 * extra_cost
->alu
.arith
;
7475 if (VECTOR_MODE_P (mode
))
7476 *cost
+= extra_cost
->vect
.alu
;
7477 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7478 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
7479 + extra_cost
->mult
[mode
== DImode
].idiv
);
7480 else if (mode
== DFmode
)
7481 *cost
+= (extra_cost
->fp
[1].mult
7482 + extra_cost
->fp
[1].div
);
7483 else if (mode
== SFmode
)
7484 *cost
+= (extra_cost
->fp
[0].mult
7485 + extra_cost
->fp
[0].div
);
7487 return false; /* All arguments need to be in registers. */
7494 if (VECTOR_MODE_P (mode
))
7495 *cost
+= extra_cost
->vect
.alu
;
7496 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7497 /* There is no integer SQRT, so only DIV and UDIV can get
7499 *cost
+= extra_cost
->mult
[mode
== DImode
].idiv
;
7501 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
7503 return false; /* All arguments need to be in registers. */
7506 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
7507 XEXP (x
, 2), cost
, speed
);
7520 return false; /* All arguments must be in registers. */
7529 if (VECTOR_MODE_P (mode
))
7530 *cost
+= extra_cost
->vect
.alu
;
7532 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
7535 /* FMSUB, FNMADD, and FNMSUB are free. */
7536 if (GET_CODE (op0
) == NEG
)
7537 op0
= XEXP (op0
, 0);
7539 if (GET_CODE (op2
) == NEG
)
7540 op2
= XEXP (op2
, 0);
7542 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7543 and the by-element operand as operand 0. */
7544 if (GET_CODE (op1
) == NEG
)
7545 op1
= XEXP (op1
, 0);
7547 /* Catch vector-by-element operations. The by-element operand can
7548 either be (vec_duplicate (vec_select (x))) or just
7549 (vec_select (x)), depending on whether we are multiplying by
7550 a vector or a scalar.
7552 Canonicalization is not very good in these cases, FMA4 will put the
7553 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7554 if (GET_CODE (op0
) == VEC_DUPLICATE
)
7555 op0
= XEXP (op0
, 0);
7556 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
7557 op1
= XEXP (op1
, 0);
7559 if (GET_CODE (op0
) == VEC_SELECT
)
7560 op0
= XEXP (op0
, 0);
7561 else if (GET_CODE (op1
) == VEC_SELECT
)
7562 op1
= XEXP (op1
, 0);
7564 /* If the remaining parameters are not registers,
7565 get the cost to put them into registers. */
7566 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
7567 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
7568 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
7572 case UNSIGNED_FLOAT
:
7574 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
7580 if (VECTOR_MODE_P (mode
))
7582 /*Vector truncate. */
7583 *cost
+= extra_cost
->vect
.alu
;
7586 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
7590 case FLOAT_TRUNCATE
:
7593 if (VECTOR_MODE_P (mode
))
7595 /*Vector conversion. */
7596 *cost
+= extra_cost
->vect
.alu
;
7599 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
7606 /* Strip the rounding part. They will all be implemented
7607 by the fcvt* family of instructions anyway. */
7608 if (GET_CODE (x
) == UNSPEC
)
7610 unsigned int uns_code
= XINT (x
, 1);
7612 if (uns_code
== UNSPEC_FRINTA
7613 || uns_code
== UNSPEC_FRINTM
7614 || uns_code
== UNSPEC_FRINTN
7615 || uns_code
== UNSPEC_FRINTP
7616 || uns_code
== UNSPEC_FRINTZ
)
7617 x
= XVECEXP (x
, 0, 0);
7622 if (VECTOR_MODE_P (mode
))
7623 *cost
+= extra_cost
->vect
.alu
;
7625 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
7628 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7629 fixed-point fcvt. */
7630 if (GET_CODE (x
) == MULT
7631 && ((VECTOR_MODE_P (mode
)
7632 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
7633 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
7635 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
7640 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
7644 if (VECTOR_MODE_P (mode
))
7648 *cost
+= extra_cost
->vect
.alu
;
7650 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7654 /* FABD, which is analogous to FADD. */
7655 if (GET_CODE (op0
) == MINUS
)
7657 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
7658 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
7660 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7664 /* Simple FABS is analogous to FNEG. */
7666 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
7670 /* Integer ABS will either be split to
7671 two arithmetic instructions, or will be an ABS
7672 (scalar), which we don't model. */
7673 *cost
= COSTS_N_INSNS (2);
7675 *cost
+= 2 * extra_cost
->alu
.arith
;
7683 if (VECTOR_MODE_P (mode
))
7684 *cost
+= extra_cost
->vect
.alu
;
7687 /* FMAXNM/FMINNM/FMAX/FMIN.
7688 TODO: This may not be accurate for all implementations, but
7689 we do not model this in the cost tables. */
7690 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7696 /* The floating point round to integer frint* instructions. */
7697 if (aarch64_frint_unspec_p (XINT (x
, 1)))
7700 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
7705 if (XINT (x
, 1) == UNSPEC_RBIT
)
7708 *cost
+= extra_cost
->alu
.rev
;
7716 /* Decompose <su>muldi3_highpart. */
7717 if (/* (truncate:DI */
7720 && GET_MODE (XEXP (x
, 0)) == TImode
7721 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
7723 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
7724 /* (ANY_EXTEND:TI (reg:DI))
7725 (ANY_EXTEND:TI (reg:DI))) */
7726 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
7727 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
7728 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
7729 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
7730 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
7731 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
7732 /* (const_int 64) */
7733 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7734 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
7738 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
7739 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
7740 mode
, MULT
, 0, speed
);
7741 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
7742 mode
, MULT
, 1, speed
);
7752 && flag_aarch64_verbose_cost
)
7754 "\nFailed to cost RTX. Assuming default cost.\n");
7759 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7760 calculated for X. This cost is stored in *COST. Returns true
7761 if the total cost of X was calculated. */
7763 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
7764 int param
, int *cost
, bool speed
)
7766 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
7769 && flag_aarch64_verbose_cost
)
7771 print_rtl_single (dump_file
, x
);
7772 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
7773 speed
? "Hot" : "Cold",
7774 *cost
, result
? "final" : "partial");
7781 aarch64_register_move_cost (machine_mode mode
,
7782 reg_class_t from_i
, reg_class_t to_i
)
7784 enum reg_class from
= (enum reg_class
) from_i
;
7785 enum reg_class to
= (enum reg_class
) to_i
;
7786 const struct cpu_regmove_cost
*regmove_cost
7787 = aarch64_tune_params
.regmove_cost
;
7789 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7790 if (to
== CALLER_SAVE_REGS
|| to
== POINTER_REGS
)
7793 if (from
== CALLER_SAVE_REGS
|| from
== POINTER_REGS
)
7794 from
= GENERAL_REGS
;
7796 /* Moving between GPR and stack cost is the same as GP2GP. */
7797 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
7798 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
7799 return regmove_cost
->GP2GP
;
7801 /* To/From the stack register, we move via the gprs. */
7802 if (to
== STACK_REG
|| from
== STACK_REG
)
7803 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
7804 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
7806 if (GET_MODE_SIZE (mode
) == 16)
7808 /* 128-bit operations on general registers require 2 instructions. */
7809 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
7810 return regmove_cost
->GP2GP
* 2;
7811 else if (from
== GENERAL_REGS
)
7812 return regmove_cost
->GP2FP
* 2;
7813 else if (to
== GENERAL_REGS
)
7814 return regmove_cost
->FP2GP
* 2;
7816 /* When AdvSIMD instructions are disabled it is not possible to move
7817 a 128-bit value directly between Q registers. This is handled in
7818 secondary reload. A general register is used as a scratch to move
7819 the upper DI value and the lower DI value is moved directly,
7820 hence the cost is the sum of three moves. */
7822 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
7824 return regmove_cost
->FP2FP
;
7827 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
7828 return regmove_cost
->GP2GP
;
7829 else if (from
== GENERAL_REGS
)
7830 return regmove_cost
->GP2FP
;
7831 else if (to
== GENERAL_REGS
)
7832 return regmove_cost
->FP2GP
;
7834 return regmove_cost
->FP2FP
;
7838 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
7839 reg_class_t rclass ATTRIBUTE_UNUSED
,
7840 bool in ATTRIBUTE_UNUSED
)
7842 return aarch64_tune_params
.memmov_cost
;
7845 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7846 to optimize 1.0/sqrt. */
7849 use_rsqrt_p (machine_mode mode
)
7851 return (!flag_trapping_math
7852 && flag_unsafe_math_optimizations
7853 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
7854 & AARCH64_APPROX_MODE (mode
))
7855 || flag_mrecip_low_precision_sqrt
));
7858 /* Function to decide when to use the approximate reciprocal square root
7862 aarch64_builtin_reciprocal (tree fndecl
)
7864 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
7866 if (!use_rsqrt_p (mode
))
7868 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
7871 typedef rtx (*rsqrte_type
) (rtx
, rtx
);
7873 /* Select reciprocal square root initial estimate insn depending on machine
7877 get_rsqrte_type (machine_mode mode
)
7881 case DFmode
: return gen_aarch64_rsqrtedf
;
7882 case SFmode
: return gen_aarch64_rsqrtesf
;
7883 case V2DFmode
: return gen_aarch64_rsqrtev2df
;
7884 case V2SFmode
: return gen_aarch64_rsqrtev2sf
;
7885 case V4SFmode
: return gen_aarch64_rsqrtev4sf
;
7886 default: gcc_unreachable ();
7890 typedef rtx (*rsqrts_type
) (rtx
, rtx
, rtx
);
7892 /* Select reciprocal square root series step insn depending on machine mode. */
7895 get_rsqrts_type (machine_mode mode
)
7899 case DFmode
: return gen_aarch64_rsqrtsdf
;
7900 case SFmode
: return gen_aarch64_rsqrtssf
;
7901 case V2DFmode
: return gen_aarch64_rsqrtsv2df
;
7902 case V2SFmode
: return gen_aarch64_rsqrtsv2sf
;
7903 case V4SFmode
: return gen_aarch64_rsqrtsv4sf
;
7904 default: gcc_unreachable ();
7908 /* Emit instruction sequence to compute either the approximate square root
7909 or its approximate reciprocal, depending on the flag RECP, and return
7910 whether the sequence was emitted or not. */
7913 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
7915 machine_mode mode
= GET_MODE (dst
);
7917 if (GET_MODE_INNER (mode
) == HFmode
)
7920 machine_mode mmsk
= mode_for_vector
7921 (int_mode_for_mode (GET_MODE_INNER (mode
)),
7922 GET_MODE_NUNITS (mode
));
7923 bool use_approx_sqrt_p
= (!recp
7924 && (flag_mlow_precision_sqrt
7925 || (aarch64_tune_params
.approx_modes
->sqrt
7926 & AARCH64_APPROX_MODE (mode
))));
7927 bool use_approx_rsqrt_p
= (recp
7928 && (flag_mrecip_low_precision_sqrt
7929 || (aarch64_tune_params
.approx_modes
->recip_sqrt
7930 & AARCH64_APPROX_MODE (mode
))));
7932 if (!flag_finite_math_only
7933 || flag_trapping_math
7934 || !flag_unsafe_math_optimizations
7935 || !(use_approx_sqrt_p
|| use_approx_rsqrt_p
)
7936 || optimize_function_for_size_p (cfun
))
7939 rtx xmsk
= gen_reg_rtx (mmsk
);
7941 /* When calculating the approximate square root, compare the argument with
7942 0.0 and create a mask. */
7943 emit_insn (gen_rtx_SET (xmsk
, gen_rtx_NEG (mmsk
, gen_rtx_EQ (mmsk
, src
,
7944 CONST0_RTX (mode
)))));
7946 /* Estimate the approximate reciprocal square root. */
7947 rtx xdst
= gen_reg_rtx (mode
);
7948 emit_insn ((*get_rsqrte_type (mode
)) (xdst
, src
));
7950 /* Iterate over the series twice for SF and thrice for DF. */
7951 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
7953 /* Optionally iterate over the series once less for faster performance
7954 while sacrificing the accuracy. */
7955 if ((recp
&& flag_mrecip_low_precision_sqrt
)
7956 || (!recp
&& flag_mlow_precision_sqrt
))
7959 /* Iterate over the series to calculate the approximate reciprocal square
7961 rtx x1
= gen_reg_rtx (mode
);
7962 while (iterations
--)
7964 rtx x2
= gen_reg_rtx (mode
);
7965 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
7967 emit_insn ((*get_rsqrts_type (mode
)) (x1
, src
, x2
));
7970 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
7975 /* Qualify the approximate reciprocal square root when the argument is
7976 0.0 by squashing the intermediary result to 0.0. */
7977 rtx xtmp
= gen_reg_rtx (mmsk
);
7978 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
7979 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
7980 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
7982 /* Calculate the approximate square root. */
7983 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
7986 /* Finalize the approximation. */
7987 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
7992 typedef rtx (*recpe_type
) (rtx
, rtx
);
7994 /* Select reciprocal initial estimate insn depending on machine mode. */
7997 get_recpe_type (machine_mode mode
)
8001 case SFmode
: return (gen_aarch64_frecpesf
);
8002 case V2SFmode
: return (gen_aarch64_frecpev2sf
);
8003 case V4SFmode
: return (gen_aarch64_frecpev4sf
);
8004 case DFmode
: return (gen_aarch64_frecpedf
);
8005 case V2DFmode
: return (gen_aarch64_frecpev2df
);
8006 default: gcc_unreachable ();
8010 typedef rtx (*recps_type
) (rtx
, rtx
, rtx
);
8012 /* Select reciprocal series step insn depending on machine mode. */
8015 get_recps_type (machine_mode mode
)
8019 case SFmode
: return (gen_aarch64_frecpssf
);
8020 case V2SFmode
: return (gen_aarch64_frecpsv2sf
);
8021 case V4SFmode
: return (gen_aarch64_frecpsv4sf
);
8022 case DFmode
: return (gen_aarch64_frecpsdf
);
8023 case V2DFmode
: return (gen_aarch64_frecpsv2df
);
8024 default: gcc_unreachable ();
8028 /* Emit the instruction sequence to compute the approximation for the division
8029 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8032 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
8034 machine_mode mode
= GET_MODE (quo
);
8036 if (GET_MODE_INNER (mode
) == HFmode
)
8039 bool use_approx_division_p
= (flag_mlow_precision_div
8040 || (aarch64_tune_params
.approx_modes
->division
8041 & AARCH64_APPROX_MODE (mode
)));
8043 if (!flag_finite_math_only
8044 || flag_trapping_math
8045 || !flag_unsafe_math_optimizations
8046 || optimize_function_for_size_p (cfun
)
8047 || !use_approx_division_p
)
8050 /* Estimate the approximate reciprocal. */
8051 rtx xrcp
= gen_reg_rtx (mode
);
8052 emit_insn ((*get_recpe_type (mode
)) (xrcp
, den
));
8054 /* Iterate over the series twice for SF and thrice for DF. */
8055 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
8057 /* Optionally iterate over the series once less for faster performance,
8058 while sacrificing the accuracy. */
8059 if (flag_mlow_precision_div
)
8062 /* Iterate over the series to calculate the approximate reciprocal. */
8063 rtx xtmp
= gen_reg_rtx (mode
);
8064 while (iterations
--)
8066 emit_insn ((*get_recps_type (mode
)) (xtmp
, xrcp
, den
));
8069 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
8072 if (num
!= CONST1_RTX (mode
))
8074 /* As the approximate reciprocal of DEN is already calculated, only
8075 calculate the approximate division when NUM is not 1.0. */
8076 rtx xnum
= force_reg (mode
, num
);
8077 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
8080 /* Finalize the approximation. */
8081 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
8085 /* Return the number of instructions that can be issued per cycle. */
8087 aarch64_sched_issue_rate (void)
8089 return aarch64_tune_params
.issue_rate
;
8093 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8095 int issue_rate
= aarch64_sched_issue_rate ();
8097 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
8101 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8102 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8103 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8106 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
8109 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
8113 /* Vectorizer cost model target hooks. */
8115 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8117 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
8119 int misalign ATTRIBUTE_UNUSED
)
8123 switch (type_of_cost
)
8126 return aarch64_tune_params
.vec_costs
->scalar_stmt_cost
;
8129 return aarch64_tune_params
.vec_costs
->scalar_load_cost
;
8132 return aarch64_tune_params
.vec_costs
->scalar_store_cost
;
8135 return aarch64_tune_params
.vec_costs
->vec_stmt_cost
;
8138 return aarch64_tune_params
.vec_costs
->vec_align_load_cost
;
8141 return aarch64_tune_params
.vec_costs
->vec_store_cost
;
8144 return aarch64_tune_params
.vec_costs
->vec_to_scalar_cost
;
8147 return aarch64_tune_params
.vec_costs
->scalar_to_vec_cost
;
8149 case unaligned_load
:
8150 return aarch64_tune_params
.vec_costs
->vec_unalign_load_cost
;
8152 case unaligned_store
:
8153 return aarch64_tune_params
.vec_costs
->vec_unalign_store_cost
;
8155 case cond_branch_taken
:
8156 return aarch64_tune_params
.vec_costs
->cond_taken_branch_cost
;
8158 case cond_branch_not_taken
:
8159 return aarch64_tune_params
.vec_costs
->cond_not_taken_branch_cost
;
8162 return aarch64_tune_params
.vec_costs
->vec_permute_cost
;
8164 case vec_promote_demote
:
8165 return aarch64_tune_params
.vec_costs
->vec_stmt_cost
;
8168 elements
= TYPE_VECTOR_SUBPARTS (vectype
);
8169 return elements
/ 2 + 1;
8176 /* Implement targetm.vectorize.add_stmt_cost. */
8178 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
8179 struct _stmt_vec_info
*stmt_info
, int misalign
,
8180 enum vect_cost_model_location where
)
8182 unsigned *cost
= (unsigned *) data
;
8183 unsigned retval
= 0;
8185 if (flag_vect_cost_model
)
8187 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
8189 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
8191 /* Statements in an inner loop relative to the loop being
8192 vectorized are weighted more heavily. The value here is
8193 arbitrary and could potentially be improved with analysis. */
8194 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
8195 count
*= 50; /* FIXME */
8197 retval
= (unsigned) (count
* stmt_cost
);
8198 cost
[where
] += retval
;
8204 static void initialize_aarch64_code_model (struct gcc_options
*);
8206 /* Parse the TO_PARSE string and put the architecture struct that it
8207 selects into RES and the architectural features into ISA_FLAGS.
8208 Return an aarch64_parse_opt_result describing the parse result.
8209 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8211 static enum aarch64_parse_opt_result
8212 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
8213 unsigned long *isa_flags
)
8216 const struct processor
*arch
;
8217 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8220 strcpy (str
, to_parse
);
8222 ext
= strchr (str
, '+');
8230 return AARCH64_PARSE_MISSING_ARG
;
8233 /* Loop through the list of supported ARCHes to find a match. */
8234 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
8236 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
8238 unsigned long isa_temp
= arch
->flags
;
8242 /* TO_PARSE string contains at least one extension. */
8243 enum aarch64_parse_opt_result ext_res
8244 = aarch64_parse_extension (ext
, &isa_temp
);
8246 if (ext_res
!= AARCH64_PARSE_OK
)
8249 /* Extension parsing was successful. Confirm the result
8250 arch and ISA flags. */
8252 *isa_flags
= isa_temp
;
8253 return AARCH64_PARSE_OK
;
8257 /* ARCH name not found in list. */
8258 return AARCH64_PARSE_INVALID_ARG
;
8261 /* Parse the TO_PARSE string and put the result tuning in RES and the
8262 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8263 describing the parse result. If there is an error parsing, RES and
8264 ISA_FLAGS are left unchanged. */
8266 static enum aarch64_parse_opt_result
8267 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
8268 unsigned long *isa_flags
)
8271 const struct processor
*cpu
;
8272 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8275 strcpy (str
, to_parse
);
8277 ext
= strchr (str
, '+');
8285 return AARCH64_PARSE_MISSING_ARG
;
8288 /* Loop through the list of supported CPUs to find a match. */
8289 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
8291 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
8293 unsigned long isa_temp
= cpu
->flags
;
8298 /* TO_PARSE string contains at least one extension. */
8299 enum aarch64_parse_opt_result ext_res
8300 = aarch64_parse_extension (ext
, &isa_temp
);
8302 if (ext_res
!= AARCH64_PARSE_OK
)
8305 /* Extension parsing was successfull. Confirm the result
8306 cpu and ISA flags. */
8308 *isa_flags
= isa_temp
;
8309 return AARCH64_PARSE_OK
;
8313 /* CPU name not found in list. */
8314 return AARCH64_PARSE_INVALID_ARG
;
8317 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8318 Return an aarch64_parse_opt_result describing the parse result.
8319 If the parsing fails the RES does not change. */
8321 static enum aarch64_parse_opt_result
8322 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
8324 const struct processor
*cpu
;
8325 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8327 strcpy (str
, to_parse
);
8329 /* Loop through the list of supported CPUs to find a match. */
8330 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
8332 if (strcmp (cpu
->name
, str
) == 0)
8335 return AARCH64_PARSE_OK
;
8339 /* CPU name not found in list. */
8340 return AARCH64_PARSE_INVALID_ARG
;
8343 /* Parse TOKEN, which has length LENGTH to see if it is an option
8344 described in FLAG. If it is, return the index bit for that fusion type.
8345 If not, error (printing OPTION_NAME) and return zero. */
8348 aarch64_parse_one_option_token (const char *token
,
8350 const struct aarch64_flag_desc
*flag
,
8351 const char *option_name
)
8353 for (; flag
->name
!= NULL
; flag
++)
8355 if (length
== strlen (flag
->name
)
8356 && !strncmp (flag
->name
, token
, length
))
8360 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
8364 /* Parse OPTION which is a comma-separated list of flags to enable.
8365 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8366 default state we inherit from the CPU tuning structures. OPTION_NAME
8367 gives the top-level option we are parsing in the -moverride string,
8368 for use in error messages. */
8371 aarch64_parse_boolean_options (const char *option
,
8372 const struct aarch64_flag_desc
*flags
,
8373 unsigned int initial_state
,
8374 const char *option_name
)
8376 const char separator
= '.';
8377 const char* specs
= option
;
8378 const char* ntoken
= option
;
8379 unsigned int found_flags
= initial_state
;
8381 while ((ntoken
= strchr (specs
, separator
)))
8383 size_t token_length
= ntoken
- specs
;
8384 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
8388 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8389 in the token stream, reset the supported operations. So:
8391 adrp+add.cmp+branch.none.adrp+add
8393 would have the result of turning on only adrp+add fusion. */
8397 found_flags
|= token_ops
;
8401 /* We ended with a comma, print something. */
8404 error ("%s string ill-formed\n", option_name
);
8408 /* We still have one more token to parse. */
8409 size_t token_length
= strlen (specs
);
8410 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
8417 found_flags
|= token_ops
;
8421 /* Support for overriding instruction fusion. */
8424 aarch64_parse_fuse_string (const char *fuse_string
,
8425 struct tune_params
*tune
)
8427 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
8428 aarch64_fusible_pairs
,
8433 /* Support for overriding other tuning flags. */
8436 aarch64_parse_tune_string (const char *tune_string
,
8437 struct tune_params
*tune
)
8439 tune
->extra_tuning_flags
8440 = aarch64_parse_boolean_options (tune_string
,
8441 aarch64_tuning_flags
,
8442 tune
->extra_tuning_flags
,
8446 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8447 we understand. If it is, extract the option string and handoff to
8448 the appropriate function. */
8451 aarch64_parse_one_override_token (const char* token
,
8453 struct tune_params
*tune
)
8455 const struct aarch64_tuning_override_function
*fn
8456 = aarch64_tuning_override_functions
;
8458 const char *option_part
= strchr (token
, '=');
8461 error ("tuning string missing in option (%s)", token
);
8465 /* Get the length of the option name. */
8466 length
= option_part
- token
;
8467 /* Skip the '=' to get to the option string. */
8470 for (; fn
->name
!= NULL
; fn
++)
8472 if (!strncmp (fn
->name
, token
, length
))
8474 fn
->parse_override (option_part
, tune
);
8479 error ("unknown tuning option (%s)",token
);
8483 /* A checking mechanism for the implementation of the tls size. */
8486 initialize_aarch64_tls_size (struct gcc_options
*opts
)
8488 if (aarch64_tls_size
== 0)
8489 aarch64_tls_size
= 24;
8491 switch (opts
->x_aarch64_cmodel_var
)
8493 case AARCH64_CMODEL_TINY
:
8494 /* Both the default and maximum TLS size allowed under tiny is 1M which
8495 needs two instructions to address, so we clamp the size to 24. */
8496 if (aarch64_tls_size
> 24)
8497 aarch64_tls_size
= 24;
8499 case AARCH64_CMODEL_SMALL
:
8500 /* The maximum TLS size allowed under small is 4G. */
8501 if (aarch64_tls_size
> 32)
8502 aarch64_tls_size
= 32;
8504 case AARCH64_CMODEL_LARGE
:
8505 /* The maximum TLS size allowed under large is 16E.
8506 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8507 if (aarch64_tls_size
> 48)
8508 aarch64_tls_size
= 48;
8517 /* Parse STRING looking for options in the format:
8518 string :: option:string
8519 option :: name=substring
8521 substring :: defined by option. */
8524 aarch64_parse_override_string (const char* input_string
,
8525 struct tune_params
* tune
)
8527 const char separator
= ':';
8528 size_t string_length
= strlen (input_string
) + 1;
8529 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
8530 char *string
= string_root
;
8531 strncpy (string
, input_string
, string_length
);
8532 string
[string_length
- 1] = '\0';
8534 char* ntoken
= string
;
8536 while ((ntoken
= strchr (string
, separator
)))
8538 size_t token_length
= ntoken
- string
;
8539 /* Make this substring look like a string. */
8541 aarch64_parse_one_override_token (string
, token_length
, tune
);
8545 /* One last option to parse. */
8546 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
8552 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
8554 /* The logic here is that if we are disabling all frame pointer generation
8555 then we do not need to disable leaf frame pointer generation as a
8556 separate operation. But if we are *only* disabling leaf frame pointer
8557 generation then we set flag_omit_frame_pointer to true, but in
8558 aarch64_frame_pointer_required we return false only for leaf functions.
8560 PR 70044: We have to be careful about being called multiple times for the
8561 same function. Once we have decided to set flag_omit_frame_pointer just
8562 so that we can omit leaf frame pointers, we must then not interpret a
8563 second call as meaning that all frame pointer generation should be
8564 omitted. We do this by setting flag_omit_frame_pointer to a special,
8566 if (opts
->x_flag_omit_frame_pointer
== 2)
8567 opts
->x_flag_omit_frame_pointer
= 0;
8569 if (opts
->x_flag_omit_frame_pointer
)
8570 opts
->x_flag_omit_leaf_frame_pointer
= false;
8571 else if (opts
->x_flag_omit_leaf_frame_pointer
)
8572 opts
->x_flag_omit_frame_pointer
= 2;
8574 /* If not optimizing for size, set the default
8575 alignment to what the target wants. */
8576 if (!opts
->x_optimize_size
)
8578 if (opts
->x_align_loops
<= 0)
8579 opts
->x_align_loops
= aarch64_tune_params
.loop_align
;
8580 if (opts
->x_align_jumps
<= 0)
8581 opts
->x_align_jumps
= aarch64_tune_params
.jump_align
;
8582 if (opts
->x_align_functions
<= 0)
8583 opts
->x_align_functions
= aarch64_tune_params
.function_align
;
8586 /* We default to no pc-relative literal loads. */
8588 aarch64_pcrelative_literal_loads
= false;
8590 /* If -mpc-relative-literal-loads is set on the command line, this
8591 implies that the user asked for PC relative literal loads. */
8592 if (opts
->x_pcrelative_literal_loads
== 1)
8593 aarch64_pcrelative_literal_loads
= true;
8595 /* This is PR70113. When building the Linux kernel with
8596 CONFIG_ARM64_ERRATUM_843419, support for relocations
8597 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8598 removed from the kernel to avoid loading objects with possibly
8599 offending sequences. Without -mpc-relative-literal-loads we would
8600 generate such relocations, preventing the kernel build from
8602 if (opts
->x_pcrelative_literal_loads
== 2
8603 && TARGET_FIX_ERR_A53_843419
)
8604 aarch64_pcrelative_literal_loads
= true;
8606 /* In the tiny memory model it makes no sense to disallow PC relative
8607 literal pool loads. */
8608 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
8609 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
8610 aarch64_pcrelative_literal_loads
= true;
8612 /* When enabling the lower precision Newton series for the square root, also
8613 enable it for the reciprocal square root, since the latter is an
8614 intermediary step for the former. */
8615 if (flag_mlow_precision_sqrt
)
8616 flag_mrecip_low_precision_sqrt
= true;
8619 /* 'Unpack' up the internal tuning structs and update the options
8620 in OPTS. The caller must have set up selected_tune and selected_arch
8621 as all the other target-specific codegen decisions are
8622 derived from them. */
8625 aarch64_override_options_internal (struct gcc_options
*opts
)
8627 aarch64_tune_flags
= selected_tune
->flags
;
8628 aarch64_tune
= selected_tune
->sched_core
;
8629 /* Make a copy of the tuning parameters attached to the core, which
8630 we may later overwrite. */
8631 aarch64_tune_params
= *(selected_tune
->tune
);
8632 aarch64_architecture_version
= selected_arch
->architecture_version
;
8634 if (opts
->x_aarch64_override_tune_string
)
8635 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
8636 &aarch64_tune_params
);
8638 /* This target defaults to strict volatile bitfields. */
8639 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
8640 opts
->x_flag_strict_volatile_bitfields
= 1;
8642 initialize_aarch64_code_model (opts
);
8643 initialize_aarch64_tls_size (opts
);
8645 int queue_depth
= 0;
8646 switch (aarch64_tune_params
.autoprefetcher_model
)
8648 case tune_params::AUTOPREFETCHER_OFF
:
8651 case tune_params::AUTOPREFETCHER_WEAK
:
8654 case tune_params::AUTOPREFETCHER_STRONG
:
8655 queue_depth
= max_insn_queue_index
+ 1;
8661 /* We don't mind passing in global_options_set here as we don't use
8662 the *options_set structs anyway. */
8663 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
8665 opts
->x_param_values
,
8666 global_options_set
.x_param_values
);
8668 /* Set the L1 cache line size. */
8669 if (selected_cpu
->tune
->cache_line_size
!= 0)
8670 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
8671 selected_cpu
->tune
->cache_line_size
,
8672 opts
->x_param_values
,
8673 global_options_set
.x_param_values
);
8675 aarch64_override_options_after_change_1 (opts
);
8678 /* Print a hint with a suggestion for a core or architecture name that
8679 most closely resembles what the user passed in STR. ARCH is true if
8680 the user is asking for an architecture name. ARCH is false if the user
8681 is asking for a core name. */
8684 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
8686 auto_vec
<const char *> candidates
;
8687 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
8688 for (; entry
->name
!= NULL
; entry
++)
8689 candidates
.safe_push (entry
->name
);
8691 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
8693 inform (input_location
, "valid arguments are: %s;"
8694 " did you mean %qs?", s
, hint
);
8698 /* Print a hint with a suggestion for a core name that most closely resembles
8699 what the user passed in STR. */
8702 aarch64_print_hint_for_core (const char *str
)
8704 aarch64_print_hint_for_core_or_arch (str
, false);
8707 /* Print a hint with a suggestion for an architecture name that most closely
8708 resembles what the user passed in STR. */
8711 aarch64_print_hint_for_arch (const char *str
)
8713 aarch64_print_hint_for_core_or_arch (str
, true);
8716 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8717 specified in STR and throw errors if appropriate. Put the results if
8718 they are valid in RES and ISA_FLAGS. Return whether the option is
8722 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
8723 unsigned long *isa_flags
)
8725 enum aarch64_parse_opt_result parse_res
8726 = aarch64_parse_cpu (str
, res
, isa_flags
);
8728 if (parse_res
== AARCH64_PARSE_OK
)
8733 case AARCH64_PARSE_MISSING_ARG
:
8734 error ("missing cpu name in -mcpu=%qs", str
);
8736 case AARCH64_PARSE_INVALID_ARG
:
8737 error ("unknown value %qs for -mcpu", str
);
8738 aarch64_print_hint_for_core (str
);
8740 case AARCH64_PARSE_INVALID_FEATURE
:
8741 error ("invalid feature modifier in -mcpu=%qs", str
);
8750 /* Validate a command-line -march option. Parse the arch and extensions
8751 (if any) specified in STR and throw errors if appropriate. Put the
8752 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8756 aarch64_validate_march (const char *str
, const struct processor
**res
,
8757 unsigned long *isa_flags
)
8759 enum aarch64_parse_opt_result parse_res
8760 = aarch64_parse_arch (str
, res
, isa_flags
);
8762 if (parse_res
== AARCH64_PARSE_OK
)
8767 case AARCH64_PARSE_MISSING_ARG
:
8768 error ("missing arch name in -march=%qs", str
);
8770 case AARCH64_PARSE_INVALID_ARG
:
8771 error ("unknown value %qs for -march", str
);
8772 aarch64_print_hint_for_arch (str
);
8774 case AARCH64_PARSE_INVALID_FEATURE
:
8775 error ("invalid feature modifier in -march=%qs", str
);
8784 /* Validate a command-line -mtune option. Parse the cpu
8785 specified in STR and throw errors if appropriate. Put the
8786 result, if it is valid, in RES. Return whether the option is
8790 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
8792 enum aarch64_parse_opt_result parse_res
8793 = aarch64_parse_tune (str
, res
);
8795 if (parse_res
== AARCH64_PARSE_OK
)
8800 case AARCH64_PARSE_MISSING_ARG
:
8801 error ("missing cpu name in -mtune=%qs", str
);
8803 case AARCH64_PARSE_INVALID_ARG
:
8804 error ("unknown value %qs for -mtune", str
);
8805 aarch64_print_hint_for_core (str
);
8813 /* Return the CPU corresponding to the enum CPU.
8814 If it doesn't specify a cpu, return the default. */
8816 static const struct processor
*
8817 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
8819 if (cpu
!= aarch64_none
)
8820 return &all_cores
[cpu
];
8822 /* The & 0x3f is to extract the bottom 6 bits that encode the
8823 default cpu as selected by the --with-cpu GCC configure option
8825 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8826 flags mechanism should be reworked to make it more sane. */
8827 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
8830 /* Return the architecture corresponding to the enum ARCH.
8831 If it doesn't specify a valid architecture, return the default. */
8833 static const struct processor
*
8834 aarch64_get_arch (enum aarch64_arch arch
)
8836 if (arch
!= aarch64_no_arch
)
8837 return &all_architectures
[arch
];
8839 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
8841 return &all_architectures
[cpu
->arch
];
8844 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8845 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8846 tuning structs. In particular it must set selected_tune and
8847 aarch64_isa_flags that define the available ISA features and tuning
8848 decisions. It must also set selected_arch as this will be used to
8849 output the .arch asm tags for each function. */
8852 aarch64_override_options (void)
8854 unsigned long cpu_isa
= 0;
8855 unsigned long arch_isa
= 0;
8856 aarch64_isa_flags
= 0;
8858 bool valid_cpu
= true;
8859 bool valid_tune
= true;
8860 bool valid_arch
= true;
8862 selected_cpu
= NULL
;
8863 selected_arch
= NULL
;
8864 selected_tune
= NULL
;
8866 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8867 If either of -march or -mtune is given, they override their
8868 respective component of -mcpu. */
8869 if (aarch64_cpu_string
)
8870 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
8873 if (aarch64_arch_string
)
8874 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
8877 if (aarch64_tune_string
)
8878 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
8880 /* If the user did not specify a processor, choose the default
8881 one for them. This will be the CPU set during configuration using
8882 --with-cpu, otherwise it is "generic". */
8887 selected_cpu
= &all_cores
[selected_arch
->ident
];
8888 aarch64_isa_flags
= arch_isa
;
8889 explicit_arch
= selected_arch
->arch
;
8893 /* Get default configure-time CPU. */
8894 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
8895 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
8899 explicit_tune_core
= selected_tune
->ident
;
8901 /* If both -mcpu and -march are specified check that they are architecturally
8902 compatible, warn if they're not and prefer the -march ISA flags. */
8903 else if (selected_arch
)
8905 if (selected_arch
->arch
!= selected_cpu
->arch
)
8907 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8908 all_architectures
[selected_cpu
->arch
].name
,
8909 selected_arch
->name
);
8911 aarch64_isa_flags
= arch_isa
;
8912 explicit_arch
= selected_arch
->arch
;
8913 explicit_tune_core
= selected_tune
? selected_tune
->ident
8914 : selected_cpu
->ident
;
8918 /* -mcpu but no -march. */
8919 aarch64_isa_flags
= cpu_isa
;
8920 explicit_tune_core
= selected_tune
? selected_tune
->ident
8921 : selected_cpu
->ident
;
8922 gcc_assert (selected_cpu
);
8923 selected_arch
= &all_architectures
[selected_cpu
->arch
];
8924 explicit_arch
= selected_arch
->arch
;
8927 /* Set the arch as well as we will need it when outputing
8928 the .arch directive in assembly. */
8931 gcc_assert (selected_cpu
);
8932 selected_arch
= &all_architectures
[selected_cpu
->arch
];
8936 selected_tune
= selected_cpu
;
8938 #ifndef HAVE_AS_MABI_OPTION
8939 /* The compiler may have been configured with 2.23.* binutils, which does
8940 not have support for ILP32. */
8942 error ("Assembler does not support -mabi=ilp32");
8945 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
8946 sorry ("Return address signing is only supported for -mabi=lp64");
8948 /* Make sure we properly set up the explicit options. */
8949 if ((aarch64_cpu_string
&& valid_cpu
)
8950 || (aarch64_tune_string
&& valid_tune
))
8951 gcc_assert (explicit_tune_core
!= aarch64_none
);
8953 if ((aarch64_cpu_string
&& valid_cpu
)
8954 || (aarch64_arch_string
&& valid_arch
))
8955 gcc_assert (explicit_arch
!= aarch64_no_arch
);
8957 aarch64_override_options_internal (&global_options
);
8959 /* Save these options as the default ones in case we push and pop them later
8960 while processing functions with potential target attributes. */
8961 target_option_default_node
= target_option_current_node
8962 = build_target_option_node (&global_options
);
8965 /* Implement targetm.override_options_after_change. */
8968 aarch64_override_options_after_change (void)
8970 aarch64_override_options_after_change_1 (&global_options
);
8973 static struct machine_function
*
8974 aarch64_init_machine_status (void)
8976 struct machine_function
*machine
;
8977 machine
= ggc_cleared_alloc
<machine_function
> ();
8982 aarch64_init_expanders (void)
8984 init_machine_status
= aarch64_init_machine_status
;
/* A checking mechanism for the implementation of the various code models.  */
static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with -f%s", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
}
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  unsigned long isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}
/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string and whether
   it is a pragma or attribute and handles the option.  It is needed only
   when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *, const char *);
  enum opt_code opt_num;
};
9152 /* Handle the ARCH_STR argument to the arch= target attribute.
9153 PRAGMA_OR_ATTR is used in potential error messages. */
9156 aarch64_handle_attr_arch (const char *str
, const char *pragma_or_attr
)
9158 const struct processor
*tmp_arch
= NULL
;
9159 enum aarch64_parse_opt_result parse_res
9160 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
9162 if (parse_res
== AARCH64_PARSE_OK
)
9164 gcc_assert (tmp_arch
);
9165 selected_arch
= tmp_arch
;
9166 explicit_arch
= selected_arch
->arch
;
9172 case AARCH64_PARSE_MISSING_ARG
:
9173 error ("missing architecture name in 'arch' target %s", pragma_or_attr
);
9175 case AARCH64_PARSE_INVALID_ARG
:
9176 error ("unknown value %qs for 'arch' target %s", str
, pragma_or_attr
);
9177 aarch64_print_hint_for_arch (str
);
9179 case AARCH64_PARSE_INVALID_FEATURE
:
9180 error ("invalid feature modifier %qs for 'arch' target %s",
9181 str
, pragma_or_attr
);
9190 /* Handle the argument CPU_STR to the cpu= target attribute.
9191 PRAGMA_OR_ATTR is used in potential error messages. */
9194 aarch64_handle_attr_cpu (const char *str
, const char *pragma_or_attr
)
9196 const struct processor
*tmp_cpu
= NULL
;
9197 enum aarch64_parse_opt_result parse_res
9198 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
9200 if (parse_res
== AARCH64_PARSE_OK
)
9202 gcc_assert (tmp_cpu
);
9203 selected_tune
= tmp_cpu
;
9204 explicit_tune_core
= selected_tune
->ident
;
9206 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
9207 explicit_arch
= selected_arch
->arch
;
9213 case AARCH64_PARSE_MISSING_ARG
:
9214 error ("missing cpu name in 'cpu' target %s", pragma_or_attr
);
9216 case AARCH64_PARSE_INVALID_ARG
:
9217 error ("unknown value %qs for 'cpu' target %s", str
, pragma_or_attr
);
9218 aarch64_print_hint_for_core (str
);
9220 case AARCH64_PARSE_INVALID_FEATURE
:
9221 error ("invalid feature modifier %qs for 'cpu' target %s",
9222 str
, pragma_or_attr
);
9231 /* Handle the argument STR to the tune= target attribute.
9232 PRAGMA_OR_ATTR is used in potential error messages. */
9235 aarch64_handle_attr_tune (const char *str
, const char *pragma_or_attr
)
9237 const struct processor
*tmp_tune
= NULL
;
9238 enum aarch64_parse_opt_result parse_res
9239 = aarch64_parse_tune (str
, &tmp_tune
);
9241 if (parse_res
== AARCH64_PARSE_OK
)
9243 gcc_assert (tmp_tune
);
9244 selected_tune
= tmp_tune
;
9245 explicit_tune_core
= selected_tune
->ident
;
9251 case AARCH64_PARSE_INVALID_ARG
:
9252 error ("unknown value %qs for 'tune' target %s", str
, pragma_or_attr
);
9253 aarch64_print_hint_for_core (str
);
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   turned on.
   PRAGMA_OR_ATTR is used in potential error messages.  */
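/* As an illustration (not taken from the sources): a string such as
   "+nothing+fp" first clears every architectural feature bit and then
   enables just the FP extension; the "+nothing" prefix is special-cased
   below before aarch64_parse_extension is invoked on the remainder.  */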
9269 aarch64_handle_attr_isa_flags (char *str
, const char *pragma_or_attr
)
9271 enum aarch64_parse_opt_result parse_res
;
9272 unsigned long isa_flags
= aarch64_isa_flags
;
9274 /* We allow "+nothing" in the beginning to clear out all architectural
9275 features if the user wants to handpick specific features. */
9276 if (strncmp ("+nothing", str
, 8) == 0)
9282 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
9284 if (parse_res
== AARCH64_PARSE_OK
)
9286 aarch64_isa_flags
= isa_flags
;
9292 case AARCH64_PARSE_MISSING_ARG
:
9293 error ("missing feature modifier in target %s %qs",
9294 pragma_or_attr
, str
);
9297 case AARCH64_PARSE_INVALID_FEATURE
:
9298 error ("invalid feature modifier in target %s %qs",
9299 pragma_or_attr
, str
);
9309 /* The target attributes that we support. On top of these we also support just
9310 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9311 handled explicitly in aarch64_process_one_target_attr. */
9313 static const struct aarch64_attribute_info aarch64_attributes
[] =
9315 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
9316 OPT_mgeneral_regs_only
},
9317 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
9318 OPT_mfix_cortex_a53_835769
},
9319 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
9320 OPT_mfix_cortex_a53_843419
},
9321 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
9322 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
9323 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
9324 OPT_momit_leaf_frame_pointer
},
9325 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
9326 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
9328 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
9329 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
9331 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
9332 OPT_msign_return_address_
},
9333 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
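/* An illustrative (hypothetical) use of the table above from user code:

     __attribute__ ((target ("arch=armv8-a+crc,cmodel=small,no-fix-cortex-a53-835769")))
     int foo (int x) { return x + 1; }

   Each comma-separated item is matched against an entry in this table; the
   "no-" prefix is only accepted when ALLOW_NEG is true, and a bare "+ext"
   item is handled separately as an ISA-flags string.  */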
9336 /* Parse ARG_STR which contains the definition of one target attribute.
9337 Show appropriate errors if any or return true if the attribute is valid.
9338 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9339 we're processing a target attribute or pragma. */
9342 aarch64_process_one_target_attr (char *arg_str
, const char* pragma_or_attr
)
9344 bool invert
= false;
9346 size_t len
= strlen (arg_str
);
9350 error ("malformed target %s", pragma_or_attr
);
9354 char *str_to_check
= (char *) alloca (len
+ 1);
9355 strcpy (str_to_check
, arg_str
);
9357 /* Skip leading whitespace. */
9358 while (*str_to_check
== ' ' || *str_to_check
== '\t')
9361 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9362 It is easier to detect and handle it explicitly here rather than going
9363 through the machinery for the rest of the target attributes in this
9365 if (*str_to_check
== '+')
9366 return aarch64_handle_attr_isa_flags (str_to_check
, pragma_or_attr
);
9368 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
9373 char *arg
= strchr (str_to_check
, '=');
9375 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9376 and point ARG to "foo". */
9382 const struct aarch64_attribute_info
*p_attr
;
9384 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
9386 /* If the names don't match up, or the user has given an argument
9387 to an attribute that doesn't accept one, or didn't give an argument
9388 to an attribute that expects one, fail to match. */
9389 if (strcmp (str_to_check
, p_attr
->name
) != 0)
9393 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
9394 || p_attr
->attr_type
== aarch64_attr_enum
;
9396 if (attr_need_arg_p
^ (arg
!= NULL
))
9398 error ("target %s %qs does not accept an argument",
9399 pragma_or_attr
, str_to_check
);
9403 /* If the name matches but the attribute does not allow "no-" versions
9404 then we can't match. */
9405 if (invert
&& !p_attr
->allow_neg
)
9407 error ("target %s %qs does not allow a negated form",
9408 pragma_or_attr
, str_to_check
);
9412 switch (p_attr
->attr_type
)
9414 /* Has a custom handler registered.
9415 For example, cpu=, arch=, tune=. */
9416 case aarch64_attr_custom
:
9417 gcc_assert (p_attr
->handler
);
9418 if (!p_attr
->handler (arg
, pragma_or_attr
))
9422 /* Either set or unset a boolean option. */
9423 case aarch64_attr_bool
:
9425 struct cl_decoded_option decoded
;
9427 generate_option (p_attr
->opt_num
, NULL
, !invert
,
9428 CL_TARGET
, &decoded
);
9429 aarch64_handle_option (&global_options
, &global_options_set
,
9430 &decoded
, input_location
);
9433 /* Set or unset a bit in the target_flags. aarch64_handle_option
9434 should know what mask to apply given the option number. */
9435 case aarch64_attr_mask
:
9437 struct cl_decoded_option decoded
;
9438 /* We only need to specify the option number.
9439 aarch64_handle_option will know which mask to apply. */
9440 decoded
.opt_index
= p_attr
->opt_num
;
9441 decoded
.value
= !invert
;
9442 aarch64_handle_option (&global_options
, &global_options_set
,
9443 &decoded
, input_location
);
9446 /* Use the option setting machinery to set an option to an enum. */
9447 case aarch64_attr_enum
:
9452 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
9456 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
9457 NULL
, DK_UNSPECIFIED
, input_location
,
9462 error ("target %s %s=%s is not valid",
9463 pragma_or_attr
, str_to_check
, arg
);
9472 /* If we reached here we either have found an attribute and validated
9473 it or didn't match any. If we matched an attribute but its arguments
9474 were malformed we will have returned false already. */
9478 /* Count how many times the character C appears in
9479 NULL-terminated string STR. */
9482 num_occurences_in_str (char c
, char *str
)
9484 unsigned int res
= 0;
9485 while (*str
!= '\0')
9496 /* Parse the tree in ARGS that contains the target attribute information
9497 and update the global target options space. PRAGMA_OR_ATTR is a string
9498 to be used in error messages, specifying whether this is processing
9499 a target attribute or a target pragma. */
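/* For example (illustrative only), an attribute such as

     __attribute__ ((target ("cmodel=tiny,strict-align")))

   reaches this function as a string constant whose comma-separated items
   are split with strtok below and handed one at a time to
   aarch64_process_one_target_attr.  */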
9502 aarch64_process_target_attr (tree args
, const char* pragma_or_attr
)
9504 if (TREE_CODE (args
) == TREE_LIST
)
9508 tree head
= TREE_VALUE (args
);
9511 if (!aarch64_process_target_attr (head
, pragma_or_attr
))
9514 args
= TREE_CHAIN (args
);
9519 /* We expect to find a string to parse. */
9520 gcc_assert (TREE_CODE (args
) == STRING_CST
);
9522 size_t len
= strlen (TREE_STRING_POINTER (args
));
9523 char *str_to_check
= (char *) alloca (len
+ 1);
9524 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
9528 error ("malformed target %s value", pragma_or_attr
);
9532 /* Used to catch empty spaces between commas i.e.
9533 attribute ((target ("attr1,,attr2"))). */
9534 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
9536 /* Handle multiple target attributes separated by ','. */
9537 char *token
= strtok (str_to_check
, ",");
9539 unsigned int num_attrs
= 0;
9543 if (!aarch64_process_one_target_attr (token
, pragma_or_attr
))
9545 error ("target %s %qs is invalid", pragma_or_attr
, token
);
9549 token
= strtok (NULL
, ",");
9552 if (num_attrs
!= num_commas
+ 1)
9554 error ("malformed target %s list %qs",
9555 pragma_or_attr
, TREE_STRING_POINTER (args
));
9562 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9563 process attribute ((target ("..."))). */
9566 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
9568 struct cl_target_option cur_target
;
9571 tree new_target
, new_optimize
;
9572 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9574 /* If what we're processing is the current pragma string then the
9575 target option node is already stored in target_option_current_node
9576 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9577 having to re-parse the string. This is especially useful to keep
9578 arm_neon.h compile times down since that header contains a lot
9579 of intrinsics enclosed in pragmas. */
9580 if (!existing_target
&& args
== current_target_pragma
)
9582 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
9585 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9587 old_optimize
= build_optimization_node (&global_options
);
9588 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9590 /* If the function changed the optimization levels as well as setting
9591 target options, start with the optimizations specified. */
9592 if (func_optimize
&& func_optimize
!= old_optimize
)
9593 cl_optimization_restore (&global_options
,
9594 TREE_OPTIMIZATION (func_optimize
));
9596 /* Save the current target options to restore at the end. */
9597 cl_target_option_save (&cur_target
, &global_options
);
9599 /* If fndecl already has some target attributes applied to it, unpack
9600 them so that we add this attribute on top of them, rather than
9601 overwriting them. */
9602 if (existing_target
)
9604 struct cl_target_option
*existing_options
9605 = TREE_TARGET_OPTION (existing_target
);
9607 if (existing_options
)
9608 cl_target_option_restore (&global_options
, existing_options
);
9611 cl_target_option_restore (&global_options
,
9612 TREE_TARGET_OPTION (target_option_current_node
));
9615 ret
= aarch64_process_target_attr (args
, "attribute");
9617 /* Set up any additional state. */
9620 aarch64_override_options_internal (&global_options
);
9621 /* Initialize SIMD builtins if we haven't already.
9622 Set current_target_pragma to NULL for the duration so that
9623 the builtin initialization code doesn't try to tag the functions
9624 being built with the attributes specified by any current pragma, thus
9625 going into an infinite recursion. */
9628 tree saved_current_target_pragma
= current_target_pragma
;
9629 current_target_pragma
= NULL
;
9630 aarch64_init_simd_builtins ();
9631 current_target_pragma
= saved_current_target_pragma
;
9633 new_target
= build_target_option_node (&global_options
);
9638 new_optimize
= build_optimization_node (&global_options
);
9642 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
9644 if (old_optimize
!= new_optimize
)
9645 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
9648 cl_target_option_restore (&global_options
, &cur_target
);
9650 if (old_optimize
!= new_optimize
)
9651 cl_optimization_restore (&global_options
,
9652 TREE_OPTIMIZATION (old_optimize
));
9656 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9657 tri-bool options (yes, no, don't care) and the default value is
9658 DEF, determine whether to reject inlining. */
9661 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
9662 int dont_care
, int def
)
9664 /* If the callee doesn't care, always allow inlining. */
9665 if (callee
== dont_care
)
9668 /* If the caller doesn't care, always allow inlining. */
9669 if (caller
== dont_care
)
9672 /* Otherwise, allow inlining if either the callee and caller values
9673 agree, or if the callee is using the default value. */
9674 return (callee
== caller
|| callee
== def
);
9677 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9678 to inline CALLEE into CALLER based on target-specific info.
9679 Make sure that the caller and callee have compatible architectural
9680 features. Then go through the other possible target attributes
9681 and see if they can block inlining. Try not to reject always_inline
9682 callees unless they are incompatible architecturally. */
9685 aarch64_can_inline_p (tree caller
, tree callee
)
9687 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
9688 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
9690 /* If callee has no option attributes, then it is ok to inline. */
9694 struct cl_target_option
*caller_opts
9695 = TREE_TARGET_OPTION (caller_tree
? caller_tree
9696 : target_option_default_node
);
9698 struct cl_target_option
*callee_opts
= TREE_TARGET_OPTION (callee_tree
);
9701 /* Callee's ISA flags should be a subset of the caller's. */
9702 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
9703 != callee_opts
->x_aarch64_isa_flags
)
9706 /* Allow non-strict aligned functions inlining into strict
9708 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
9709 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
9710 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
9711 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
9714 bool always_inline
= lookup_attribute ("always_inline",
9715 DECL_ATTRIBUTES (callee
));
9717 /* If the architectural features match up and the callee is always_inline
9718 then the other attributes don't matter. */
9722 if (caller_opts
->x_aarch64_cmodel_var
9723 != callee_opts
->x_aarch64_cmodel_var
)
9726 if (caller_opts
->x_aarch64_tls_dialect
9727 != callee_opts
->x_aarch64_tls_dialect
)
9730 /* Honour explicit requests to workaround errata. */
9731 if (!aarch64_tribools_ok_for_inlining_p (
9732 caller_opts
->x_aarch64_fix_a53_err835769
,
9733 callee_opts
->x_aarch64_fix_a53_err835769
,
9734 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
9737 if (!aarch64_tribools_ok_for_inlining_p (
9738 caller_opts
->x_aarch64_fix_a53_err843419
,
9739 callee_opts
->x_aarch64_fix_a53_err843419
,
9740 2, TARGET_FIX_ERR_A53_843419
))
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
9745 if (!aarch64_tribools_ok_for_inlining_p (
9746 caller_opts
->x_flag_omit_leaf_frame_pointer
,
9747 callee_opts
->x_flag_omit_leaf_frame_pointer
,
9751 /* If the callee has specific tuning overrides, respect them. */
9752 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
9753 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here; we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
9760 if (callee_opts
->x_aarch64_override_tune_string
9761 && caller_opts
->x_aarch64_override_tune_string
9762 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
9763 caller_opts
->x_aarch64_override_tune_string
) != 0))
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}
/* Return true if SYMBOL_REF X is thread local */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
9792 /* Classify a TLS symbol into one of the TLS kinds. */
9793 enum aarch64_symbol_type
9794 aarch64_classify_tls_symbol (rtx x
)
9796 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
9800 case TLS_MODEL_GLOBAL_DYNAMIC
:
9801 case TLS_MODEL_LOCAL_DYNAMIC
:
9802 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
9804 case TLS_MODEL_INITIAL_EXEC
:
9805 switch (aarch64_cmodel
)
9807 case AARCH64_CMODEL_TINY
:
9808 case AARCH64_CMODEL_TINY_PIC
:
9809 return SYMBOL_TINY_TLSIE
;
9811 return SYMBOL_SMALL_TLSIE
;
9814 case TLS_MODEL_LOCAL_EXEC
:
9815 if (aarch64_tls_size
== 12)
9816 return SYMBOL_TLSLE12
;
9817 else if (aarch64_tls_size
== 24)
9818 return SYMBOL_TLSLE24
;
9819 else if (aarch64_tls_size
== 32)
9820 return SYMBOL_TLSLE32
;
9821 else if (aarch64_tls_size
== 48)
9822 return SYMBOL_TLSLE48
;
9826 case TLS_MODEL_EMULATED
:
9827 case TLS_MODEL_NONE
:
9828 return SYMBOL_FORCE_TO_MEM
;
9835 /* Return the method that should be used to access SYMBOL_REF or
9838 enum aarch64_symbol_type
9839 aarch64_classify_symbol (rtx x
, rtx offset
)
9841 if (GET_CODE (x
) == LABEL_REF
)
9843 switch (aarch64_cmodel
)
9845 case AARCH64_CMODEL_LARGE
:
9846 return SYMBOL_FORCE_TO_MEM
;
9848 case AARCH64_CMODEL_TINY_PIC
:
9849 case AARCH64_CMODEL_TINY
:
9850 return SYMBOL_TINY_ABSOLUTE
;
9852 case AARCH64_CMODEL_SMALL_SPIC
:
9853 case AARCH64_CMODEL_SMALL_PIC
:
9854 case AARCH64_CMODEL_SMALL
:
9855 return SYMBOL_SMALL_ABSOLUTE
;
9862 if (GET_CODE (x
) == SYMBOL_REF
)
9864 if (aarch64_tls_symbol_p (x
))
9865 return aarch64_classify_tls_symbol (x
);
9867 switch (aarch64_cmodel
)
9869 case AARCH64_CMODEL_TINY
:
	  /* When we retrieve a symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of the symbol at compile
	     time, so we can't accurately say if the distance between the PC
	     and symbol + offset is outside the addressable range of +/-1MB
	     in the TINY code model.  So we rely on images not being greater
	     than 1MB, cap the offset at 1MB, and require anything beyond 1MB
	     to be loaded using an alternative mechanism.  Furthermore, if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force it to memory.  */
9880 if ((SYMBOL_REF_WEAK (x
)
9881 && !aarch64_symbol_binds_local_p (x
))
9882 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
9883 return SYMBOL_FORCE_TO_MEM
;
9884 return SYMBOL_TINY_ABSOLUTE
;
9886 case AARCH64_CMODEL_SMALL
:
9887 /* Same reasoning as the tiny code model, but the offset cap here is
9889 if ((SYMBOL_REF_WEAK (x
)
9890 && !aarch64_symbol_binds_local_p (x
))
9891 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
9892 HOST_WIDE_INT_C (4294967264)))
9893 return SYMBOL_FORCE_TO_MEM
;
9894 return SYMBOL_SMALL_ABSOLUTE
;
9896 case AARCH64_CMODEL_TINY_PIC
:
9897 if (!aarch64_symbol_binds_local_p (x
))
9898 return SYMBOL_TINY_GOT
;
9899 return SYMBOL_TINY_ABSOLUTE
;
9901 case AARCH64_CMODEL_SMALL_SPIC
:
9902 case AARCH64_CMODEL_SMALL_PIC
:
9903 if (!aarch64_symbol_binds_local_p (x
))
9904 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
9905 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
9906 return SYMBOL_SMALL_ABSOLUTE
;
9908 case AARCH64_CMODEL_LARGE
:
9909 /* This is alright even in PIC code as the constant
9910 pool reference is always PC relative and within
9911 the same translation unit. */
9912 if (CONSTANT_POOL_ADDRESS_P (x
))
9913 return SYMBOL_SMALL_ABSOLUTE
;
9915 return SYMBOL_FORCE_TO_MEM
;
9922 /* By default push everything into the constant pool. */
9923 return SYMBOL_FORCE_TO_MEM
;
9927 aarch64_constant_address_p (rtx x
)
9929 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
9933 aarch64_legitimate_pic_operand_p (rtx x
)
9935 if (GET_CODE (x
) == SYMBOL_REF
9936 || (GET_CODE (x
) == CONST
9937 && GET_CODE (XEXP (x
, 0)) == PLUS
9938 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
9944 /* Return true if X holds either a quarter-precision or
9945 floating-point +0.0 constant. */
9947 aarch64_valid_floating_const (machine_mode mode
, rtx x
)
9949 if (!CONST_DOUBLE_P (x
))
9952 if (aarch64_float_const_zero_rtx_p (x
))
9955 /* We only handle moving 0.0 to a TFmode register. */
9956 if (!(mode
== SFmode
|| mode
== DFmode
))
9959 return aarch64_float_const_representable_p (x
);
9963 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
9965 /* Do not allow vector struct mode constants. We could support
9966 0 and -1 easily, but they need support in aarch64-simd.md. */
9967 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
9970 /* This could probably go away because
9971 we now decompose CONST_INTs according to expand_mov_immediate. */
9972 if ((GET_CODE (x
) == CONST_VECTOR
9973 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
9974 || CONST_INT_P (x
) || aarch64_valid_floating_const (mode
, x
))
9975 return !targetm
.cannot_force_const_mem (mode
, x
);
9977 if (GET_CODE (x
) == HIGH
9978 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
9981 return aarch64_constant_address_p (x
);
9985 aarch64_load_tp (rtx target
)
9988 || GET_MODE (target
) != Pmode
9989 || !register_operand (target
, Pmode
))
9990 target
= gen_reg_rtx (Pmode
);
9992 /* Can return in any reg. */
9993 emit_insn (gen_aarch64_load_tp_hard (target
));
9997 /* On AAPCS systems, this is the "struct __va_list". */
9998 static GTY(()) tree va_list_type
;
10000 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10001 Return the type to use as __builtin_va_list.
10003 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
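   (layout sketch, inferred from the fields created below):

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */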
10015 aarch64_build_builtin_va_list (void)
10018 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10020 /* Create the type. */
10021 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
10022 /* Give it the required name. */
10023 va_list_name
= build_decl (BUILTINS_LOCATION
,
10025 get_identifier ("__va_list"),
10027 DECL_ARTIFICIAL (va_list_name
) = 1;
10028 TYPE_NAME (va_list_type
) = va_list_name
;
10029 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
10031 /* Create the fields. */
10032 f_stack
= build_decl (BUILTINS_LOCATION
,
10033 FIELD_DECL
, get_identifier ("__stack"),
10035 f_grtop
= build_decl (BUILTINS_LOCATION
,
10036 FIELD_DECL
, get_identifier ("__gr_top"),
10038 f_vrtop
= build_decl (BUILTINS_LOCATION
,
10039 FIELD_DECL
, get_identifier ("__vr_top"),
10041 f_groff
= build_decl (BUILTINS_LOCATION
,
10042 FIELD_DECL
, get_identifier ("__gr_offs"),
10043 integer_type_node
);
10044 f_vroff
= build_decl (BUILTINS_LOCATION
,
10045 FIELD_DECL
, get_identifier ("__vr_offs"),
10046 integer_type_node
);
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating va_list internal
     offset fields in an irregular way.  */
10052 va_list_gpr_counter_field
= f_groff
;
10053 va_list_fpr_counter_field
= f_vroff
;
10055 DECL_ARTIFICIAL (f_stack
) = 1;
10056 DECL_ARTIFICIAL (f_grtop
) = 1;
10057 DECL_ARTIFICIAL (f_vrtop
) = 1;
10058 DECL_ARTIFICIAL (f_groff
) = 1;
10059 DECL_ARTIFICIAL (f_vroff
) = 1;
10061 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
10062 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
10063 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
10064 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
10065 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
10067 TYPE_FIELDS (va_list_type
) = f_stack
;
10068 DECL_CHAIN (f_stack
) = f_grtop
;
10069 DECL_CHAIN (f_grtop
) = f_vrtop
;
10070 DECL_CHAIN (f_vrtop
) = f_groff
;
10071 DECL_CHAIN (f_groff
) = f_vroff
;
10073 /* Compute its layout. */
10074 layout_type (va_list_type
);
10076 return va_list_type
;
10079 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10081 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
10083 const CUMULATIVE_ARGS
*cum
;
10084 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10085 tree stack
, grtop
, vrtop
, groff
, vroff
;
10087 int gr_save_area_size
= cfun
->va_list_gpr_size
;
10088 int vr_save_area_size
= cfun
->va_list_fpr_size
;
10091 cum
= &crtl
->args
.info
;
10092 if (cfun
->va_list_gpr_size
)
10093 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
10094 cfun
->va_list_gpr_size
);
10095 if (cfun
->va_list_fpr_size
)
10096 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
10097 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
10101 gcc_assert (cum
->aapcs_nvrn
== 0);
10102 vr_save_area_size
= 0;
10105 f_stack
= TYPE_FIELDS (va_list_type_node
);
10106 f_grtop
= DECL_CHAIN (f_stack
);
10107 f_vrtop
= DECL_CHAIN (f_grtop
);
10108 f_groff
= DECL_CHAIN (f_vrtop
);
10109 f_vroff
= DECL_CHAIN (f_groff
);
10111 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
10113 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
10115 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
10117 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
10119 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
10122 /* Emit code to initialize STACK, which points to the next varargs stack
10123 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10124 by named arguments. STACK is 8-byte aligned. */
10125 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
10126 if (cum
->aapcs_stack_size
> 0)
10127 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
10128 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
10129 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10131 /* Emit code to initialize GRTOP, the top of the GR save area.
10132 virtual_incoming_args_rtx should have been 16 byte aligned. */
10133 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
10134 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
10135 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10137 /* Emit code to initialize VRTOP, the top of the VR save area.
10138 This address is gr_save_area_bytes below GRTOP, rounded
10139 down to the next 16-byte boundary. */
10140 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
10141 vr_offset
= ROUND_UP (gr_save_area_size
,
10142 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10145 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
10146 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
10147 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10149 /* Emit code to initialize GROFF, the offset from GRTOP of the
10150 next GPR argument. */
10151 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
10152 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
10153 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10155 /* Likewise emit code to initialize VROFF, the offset from FTOP
10156 of the next VR argument. */
10157 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
10158 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
10159 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10162 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10165 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
10166 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
10170 bool is_ha
; /* is HFA or HVA. */
10171 bool dw_align
; /* double-word align. */
10172 machine_mode ag_mode
= VOIDmode
;
10176 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10177 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
10178 HOST_WIDE_INT size
, rsize
, adjust
, align
;
10179 tree t
, u
, cond1
, cond2
;
10181 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
10183 type
= build_pointer_type (type
);
10185 mode
= TYPE_MODE (type
);
10187 f_stack
= TYPE_FIELDS (va_list_type_node
);
10188 f_grtop
= DECL_CHAIN (f_stack
);
10189 f_vrtop
= DECL_CHAIN (f_grtop
);
10190 f_groff
= DECL_CHAIN (f_vrtop
);
10191 f_vroff
= DECL_CHAIN (f_groff
);
10193 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
10194 f_stack
, NULL_TREE
);
10195 size
= int_size_in_bytes (type
);
10196 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
10200 if (aarch64_vfp_is_call_or_return_candidate (mode
,
10206 /* TYPE passed in fp/simd registers. */
10208 aarch64_err_no_fpadvsimd (mode
, "varargs");
10210 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
10211 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
10212 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
10213 unshare_expr (valist
), f_vroff
, NULL_TREE
);
10215 rsize
= nregs
* UNITS_PER_VREG
;
10219 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
10220 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
10222 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10223 && size
< UNITS_PER_VREG
)
10225 adjust
= UNITS_PER_VREG
- size
;
10230 /* TYPE passed in general registers. */
10231 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
10232 unshare_expr (valist
), f_grtop
, NULL_TREE
);
10233 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
10234 unshare_expr (valist
), f_groff
, NULL_TREE
);
10235 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
10236 nregs
= rsize
/ UNITS_PER_WORD
;
10241 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10242 && size
< UNITS_PER_WORD
)
10244 adjust
= UNITS_PER_WORD
- size
;
10248 /* Get a local temporary for the field value. */
10249 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
10251 /* Emit code to branch if off >= 0. */
10252 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
10253 build_int_cst (TREE_TYPE (off
), 0));
10254 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
10258 /* Emit: offs = (offs + 15) & -16. */
10259 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10260 build_int_cst (TREE_TYPE (off
), 15));
10261 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
10262 build_int_cst (TREE_TYPE (off
), -16));
10263 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
10268 /* Update ap.__[g|v]r_offs */
10269 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10270 build_int_cst (TREE_TYPE (off
), rsize
));
10271 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
10275 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10277 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10278 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
10279 build_int_cst (TREE_TYPE (f_off
), 0));
10280 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
10282 /* String up: make sure the assignment happens before the use. */
10283 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
10284 COND_EXPR_ELSE (cond1
) = t
;
10286 /* Prepare the trees handling the argument that is passed on the stack;
10287 the top level node will store in ON_STACK. */
10288 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
10291 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10292 t
= fold_convert (intDI_type_node
, arg
);
10293 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10294 build_int_cst (TREE_TYPE (t
), 15));
10295 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10296 build_int_cst (TREE_TYPE (t
), -16));
10297 t
= fold_convert (TREE_TYPE (arg
), t
);
10298 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
10302 /* Advance ap.__stack */
10303 t
= fold_convert (intDI_type_node
, arg
);
10304 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10305 build_int_cst (TREE_TYPE (t
), size
+ 7));
10306 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10307 build_int_cst (TREE_TYPE (t
), -8));
10308 t
= fold_convert (TREE_TYPE (arg
), t
);
10309 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
10310 /* String up roundup and advance. */
10312 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10313 /* String up with arg */
10314 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
10315 /* Big-endianness related address adjustment. */
10316 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
10317 && size
< UNITS_PER_WORD
)
10319 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
10320 size_int (UNITS_PER_WORD
- size
));
10321 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
10324 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
10325 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
10327 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10330 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
10331 build_int_cst (TREE_TYPE (off
), adjust
));
10333 t
= fold_convert (sizetype
, t
);
10334 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
10338 /* type ha; // treat as "struct {ftype field[n];}"
10339 ... [computing offs]
10340 for (i = 0; i <nregs; ++i, offs += 16)
10341 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10344 tree tmp_ha
, field_t
, field_ptr_t
;
10346 /* Declare a local variable. */
10347 tmp_ha
= create_tmp_var_raw (type
, "ha");
10348 gimple_add_tmp_var (tmp_ha
);
10350 /* Establish the base type. */
10354 field_t
= float_type_node
;
10355 field_ptr_t
= float_ptr_type_node
;
10358 field_t
= double_type_node
;
10359 field_ptr_t
= double_ptr_type_node
;
10362 field_t
= long_double_type_node
;
10363 field_ptr_t
= long_double_ptr_type_node
;
10366 field_t
= aarch64_fp16_type_node
;
10367 field_ptr_t
= aarch64_fp16_ptr_type_node
;
10372 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
10373 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
10374 field_ptr_t
= build_pointer_type (field_t
);
10381 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10382 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
10384 t
= fold_convert (field_ptr_t
, addr
);
10385 t
= build2 (MODIFY_EXPR
, field_t
,
10386 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
10387 build1 (INDIRECT_REF
, field_t
, t
));
10389 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10390 for (i
= 1; i
< nregs
; ++i
)
10392 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
10393 u
= fold_convert (field_ptr_t
, addr
);
10394 u
= build2 (MODIFY_EXPR
, field_t
,
10395 build2 (MEM_REF
, field_t
, tmp_ha
,
10396 build_int_cst (field_ptr_t
,
10398 int_size_in_bytes (field_t
)))),
10399 build1 (INDIRECT_REF
, field_t
, u
));
10400 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
10403 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
10404 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
10407 COND_EXPR_ELSE (cond2
) = t
;
10408 addr
= fold_convert (build_pointer_type (type
), cond1
);
10409 addr
= build_va_arg_indirect_ref (addr
);
10412 addr
= build_va_arg_indirect_ref (addr
);
10417 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10420 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
10421 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
10424 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
10425 CUMULATIVE_ARGS local_cum
;
10426 int gr_saved
= cfun
->va_list_gpr_size
;
10427 int vr_saved
= cfun
->va_list_fpr_size
;
10429 /* The caller has advanced CUM up to, but not beyond, the last named
10430 argument. Advance a local copy of CUM past the last "real" named
10431 argument, to find out how many registers are left over. */
10433 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
10437 if (cfun
->va_list_gpr_size
)
10438 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
10439 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
10440 if (cfun
->va_list_fpr_size
)
10441 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
10442 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
10446 gcc_assert (local_cum
.aapcs_nvrn
== 0);
10456 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10457 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
10458 - gr_saved
* UNITS_PER_WORD
);
10459 mem
= gen_frame_mem (BLKmode
, ptr
);
10460 set_mem_alias_set (mem
, get_varargs_alias_set ());
10462 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
10467 /* We can't use move_block_from_reg, because it will use
10468 the wrong mode, storing D regs only. */
10469 machine_mode mode
= TImode
;
10470 int off
, i
, vr_start
;
10472 /* Set OFF to the offset from virtual_incoming_args_rtx of
10473 the first vector register. The VR save area lies below
10474 the GR one, and is aligned to 16 bytes. */
10475 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10476 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10477 off
-= vr_saved
* UNITS_PER_VREG
;
10479 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
10480 for (i
= 0; i
< vr_saved
; ++i
)
10484 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
10485 mem
= gen_frame_mem (mode
, ptr
);
10486 set_mem_alias_set (mem
, get_varargs_alias_set ());
10487 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
10488 off
+= UNITS_PER_VREG
;
10493 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10494 any complication of having crtl->args.pretend_args_size changed. */
10495 cfun
->machine
->frame
.saved_varargs_size
10496 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10497 STACK_BOUNDARY
/ BITS_PER_UNIT
)
10498 + vr_saved
* UNITS_PER_VREG
);
10502 aarch64_conditional_register_usage (void)
10507 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
10510 call_used_regs
[i
] = 1;
10515 /* Walk down the type tree of TYPE counting consecutive base elements.
10516 If *MODEP is VOIDmode, then set it to the first valid floating point
10517 type. If a non-floating point type is found, or if a floating point
10518 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10519 otherwise return the count in the sub-tree. */
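/* For example (illustrative only): for

     struct hfa { double a; double b[2]; };

   the walk below sets *MODEP to DFmode and returns 3, so the type is a
   candidate homogeneous floating-point aggregate; mixing in a member of a
   different floating-point mode would make the walk return -1 instead.  */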
10521 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
10524 HOST_WIDE_INT size
;
10526 switch (TREE_CODE (type
))
10529 mode
= TYPE_MODE (type
);
10530 if (mode
!= DFmode
&& mode
!= SFmode
10531 && mode
!= TFmode
&& mode
!= HFmode
)
10534 if (*modep
== VOIDmode
)
10537 if (*modep
== mode
)
10543 mode
= TYPE_MODE (TREE_TYPE (type
));
10544 if (mode
!= DFmode
&& mode
!= SFmode
10545 && mode
!= TFmode
&& mode
!= HFmode
)
10548 if (*modep
== VOIDmode
)
10551 if (*modep
== mode
)
10557 /* Use V2SImode and V4SImode as representatives of all 64-bit
10558 and 128-bit vector types. */
10559 size
= int_size_in_bytes (type
);
10572 if (*modep
== VOIDmode
)
10575 /* Vector modes are considered to be opaque: two vectors are
10576 equivalent for the purposes of being homogeneous aggregates
10577 if they are the same size. */
10578 if (*modep
== mode
)
10586 tree index
= TYPE_DOMAIN (type
);
10588 /* Can't handle incomplete types nor sizes that are not
10590 if (!COMPLETE_TYPE_P (type
)
10591 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10594 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
10597 || !TYPE_MAX_VALUE (index
)
10598 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
10599 || !TYPE_MIN_VALUE (index
)
10600 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
10604 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
10605 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
10607 /* There must be no padding. */
10608 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10620 /* Can't handle incomplete types nor sizes that are not
10622 if (!COMPLETE_TYPE_P (type
)
10623 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10626 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10628 if (TREE_CODE (field
) != FIELD_DECL
)
10631 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10634 count
+= sub_count
;
10637 /* There must be no padding. */
10638 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10645 case QUAL_UNION_TYPE
:
10647 /* These aren't very interesting except in a degenerate case. */
10652 /* Can't handle incomplete types nor sizes that are not
10654 if (!COMPLETE_TYPE_P (type
)
10655 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
10658 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
10660 if (TREE_CODE (field
) != FIELD_DECL
)
10663 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
10666 count
= count
> sub_count
? count
: sub_count
;
10669 /* There must be no padding. */
10670 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
10683 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10684 type as described in AAPCS64 \S 4.1.2.
10686 See the comment above aarch64_composite_type_p for the notes on MODE. */
10689 aarch64_short_vector_p (const_tree type
,
10692 HOST_WIDE_INT size
= -1;
10694 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
10695 size
= int_size_in_bytes (type
);
10696 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
10697 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
10698 size
= GET_MODE_SIZE (mode
);
10700 return (size
== 8 || size
== 16);
10703 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10704 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10705 array types. The C99 floating-point complex types are also considered
10706 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10707 types, which are GCC extensions and out of the scope of AAPCS64, are
10708 treated as composite types here as well.
10710 Note that MODE itself is not sufficient in determining whether a type
10711 is such a composite type or not. This is because
10712 stor-layout.c:compute_record_mode may have already changed the MODE
10713 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10714 structure with only one field may have its MODE set to the mode of the
10715 field. Also an integer mode whose size matches the size of the
10716 RECORD_TYPE type may be used to substitute the original mode
10717 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10718 solely relied on. */
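/* For instance (illustrative only), "struct s { double d; };" may arrive
   here with MODE == DFmode rather than BLKmode because of
   compute_record_mode, which is why the TYPE-based aggregate check below
   is performed independently of the mode-based checks.  */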
10721 aarch64_composite_type_p (const_tree type
,
10724 if (aarch64_short_vector_p (type
, mode
))
10727 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
10730 if (mode
== BLKmode
10731 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
10732 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
10738 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10739 shall be passed or returned in simd/fp register(s) (providing these
10740 parameter passing registers are available).
10742 Upon successful return, *COUNT returns the number of needed registers,
10743 *BASE_MODE returns the mode of the individual register and when IS_HAF
10744 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10745 floating-point aggregate or a homogeneous short-vector aggregate. */
10748 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
10750 machine_mode
*base_mode
,
10754 machine_mode new_mode
= VOIDmode
;
10755 bool composite_p
= aarch64_composite_type_p (type
, mode
);
10757 if (is_ha
!= NULL
) *is_ha
= false;
10759 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10760 || aarch64_short_vector_p (type
, mode
))
10765 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
10767 if (is_ha
!= NULL
) *is_ha
= true;
10769 new_mode
= GET_MODE_INNER (mode
);
10771 else if (type
&& composite_p
)
10773 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
10775 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
10777 if (is_ha
!= NULL
) *is_ha
= true;
10786 *base_mode
= new_mode
;
10790 /* Implement TARGET_STRUCT_VALUE_RTX. */
10793 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
10794 int incoming ATTRIBUTE_UNUSED
)
10796 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
10799 /* Implements target hook vector_mode_supported_p. */
10801 aarch64_vector_mode_supported_p (machine_mode mode
)
10804 && (mode
== V4SImode
|| mode
== V8HImode
10805 || mode
== V16QImode
|| mode
== V2DImode
10806 || mode
== V2SImode
|| mode
== V4HImode
10807 || mode
== V8QImode
|| mode
== V2SFmode
10808 || mode
== V4SFmode
|| mode
== V2DFmode
10809 || mode
== V4HFmode
|| mode
== V8HFmode
10810 || mode
== V1DFmode
))
10816 /* Return appropriate SIMD container
10817 for MODE within a vector of WIDTH bits. */
10818 static machine_mode
10819 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
10821 gcc_assert (width
== 64 || width
== 128);
10860 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10861 static machine_mode
10862 aarch64_preferred_simd_mode (machine_mode mode
)
10864 return aarch64_simd_container_mode (mode
, 128);
10867 /* Return the bitmask of possible vector sizes for the vectorizer
10868 to iterate over. */
10869 static unsigned int
10870 aarch64_autovectorize_vector_sizes (void)
10875 /* Implement TARGET_MANGLE_TYPE. */
10877 static const char *
10878 aarch64_mangle_type (const_tree type
)
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
10882 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
10883 return "St9__va_list";
10885 /* Half-precision float. */
10886 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
10889 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10891 if (TYPE_NAME (type
) != NULL
)
10892 return aarch64_mangle_builtin_type (type
);
10894 /* Use the default mangling. */
10899 /* Return true if the rtx_insn contains a MEM RTX somewhere
10903 has_memory_op (rtx_insn
*mem_insn
)
10905 subrtx_iterator::array_type array
;
10906 FOR_EACH_SUBRTX (iter
, array
, PATTERN (mem_insn
), ALL
)
10913 /* Find the first rtx_insn before insn that will generate an assembly
10917 aarch64_prev_real_insn (rtx_insn
*insn
)
10924 insn
= prev_real_insn (insn
);
10926 while (insn
&& recog_memoized (insn
) < 0);
10932 is_madd_op (enum attr_type t1
)
10935 /* A number of these may be AArch32 only. */
10936 enum attr_type mlatypes
[] = {
10937 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
10938 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
10939 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
10942 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
10944 if (t1
== mlatypes
[i
])
10951 /* Check if there is a register dependency between a load and the insn
10952 for which we hold recog_data. */
10955 dep_between_memop_and_curr (rtx memop
)
10960 gcc_assert (GET_CODE (memop
) == SET
);
10962 if (!REG_P (SET_DEST (memop
)))
10965 load_reg
= SET_DEST (memop
);
10966 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
10968 rtx operand
= recog_data
.operand
[opno
];
10969 if (REG_P (operand
)
10970 && reg_overlap_mentioned_p (load_reg
, operand
))
10978 /* When working around the Cortex-A53 erratum 835769,
10979 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10980 instruction and has a preceding memory instruction such that a NOP
10981 should be inserted between them. */
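/* An illustrative (not real) encoding of the problematic sequence:

     ldr  x1, [x2]		// preceding memory operation
     madd x3, x4, x5, x6	// 64-bit multiply-accumulate

   With -mfix-cortex-a53-835769 enabled, the final-prescan hook below emits
   a NOP between the two instructions.  */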
10984 aarch64_madd_needs_nop (rtx_insn
* insn
)
10986 enum attr_type attr_type
;
10990 if (!TARGET_FIX_ERR_A53_835769
)
10993 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
10996 attr_type
= get_attr_type (insn
);
10997 if (!is_madd_op (attr_type
))
11000 prev
= aarch64_prev_real_insn (insn
);
11001 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11002 Restore recog state to INSN to avoid state corruption. */
11003 extract_constrain_insn_cached (insn
);
11005 if (!prev
|| !has_memory_op (prev
))
11008 body
= single_set (prev
);
11010 /* If the previous insn is a memory op and there is no dependency between
11011 it and the DImode madd, emit a NOP between them. If body is NULL then we
11012 have a complex memory operation, probably a load/store pair.
11013 Be conservative for now and emit a NOP. */
11014 if (GET_MODE (recog_data
.operand
[0]) == DImode
11015 && (!body
|| !dep_between_memop_and_curr (body
)))
11023 /* Implement FINAL_PRESCAN_INSN. */
11026 aarch64_final_prescan_insn (rtx_insn
*insn
)
11028 if (aarch64_madd_needs_nop (insn
))
11029 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
11033 /* Return the equivalent letter for size. */
11035 sizetochar (int size
)
11039 case 64: return 'd';
11040 case 32: return 's';
11041 case 16: return 'h';
11042 case 8 : return 'b';
11043 default: gcc_unreachable ();
11047 /* Return true iff x is a uniform vector of floating-point
11048 constants, and the constant can be represented in
11049 quarter-precision form. Note, as aarch64_float_const_representable
11050 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11052 aarch64_vect_float_const_representable_p (rtx x
)
11055 return (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
11056 && const_vec_duplicate_p (x
, &elt
)
11057 && aarch64_float_const_representable_p (elt
));
11060 /* Return true for valid and false for invalid. */
11062 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
11063 struct simd_immediate_info
*info
)
11065 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11067 for (i = 0; i < idx; i += (STRIDE)) \
11072 immtype = (CLASS); \
11073 elsize = (ELSIZE); \
11074 eshift = (SHIFT); \
11079 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
11080 unsigned int innersize
= GET_MODE_UNIT_SIZE (mode
);
11081 unsigned char bytes
[16];
11082 int immtype
= -1, matches
;
11083 unsigned int invmask
= inverse
? 0xff : 0;
11086 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11088 if (! (aarch64_simd_imm_zero_p (op
, mode
)
11089 || aarch64_vect_float_const_representable_p (op
)))
11094 info
->value
= CONST_VECTOR_ELT (op
, 0);
11095 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
11103 /* Splat vector constant out into a byte vector. */
11104 for (i
= 0; i
< n_elts
; i
++)
11106 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11107 it must be laid out in the vector register in reverse order. */
11108 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
11109 unsigned HOST_WIDE_INT elpart
;
11111 gcc_assert (CONST_INT_P (el
));
11112 elpart
= INTVAL (el
);
11114 for (unsigned int byte
= 0; byte
< innersize
; byte
++)
11116 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
11117 elpart
>>= BITS_PER_UNIT
;
11122 /* Sanity check. */
11123 gcc_assert (idx
== GET_MODE_SIZE (mode
));
11127 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
11128 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
11130 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11131 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11133 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11134 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11136 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11137 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
11139 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
11141 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
11143 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
11144 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
11146 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11147 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11149 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11150 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11152 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11153 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
11155 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
11157 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
11159 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11160 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11162 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11163 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11165 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11166 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11168 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11169 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11171 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
11173 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
11174 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
11183 info
->element_width
= elsize
;
11184 info
->mvn
= emvn
!= 0;
11185 info
->shift
= eshift
;
11187 unsigned HOST_WIDE_INT imm
= 0;
11189 if (immtype
>= 12 && immtype
<= 15)
11192 /* Un-invert bytes of recognized vector, if necessary. */
11194 for (i
= 0; i
< idx
; i
++)
11195 bytes
[i
] ^= invmask
;
11199 /* FIXME: Broken on 32-bit H_W_I hosts. */
11200 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
11202 for (i
= 0; i
< 8; i
++)
11203 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
11204 << (i
* BITS_PER_UNIT
);
11207 info
->value
= GEN_INT (imm
);
11211 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
11212 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
11214 /* Construct 'abcdefgh' because the assembler cannot handle
11215 generic constants. */
11218 imm
= (imm
>> info
->shift
) & 0xff;
11219 info
->value
= GEN_INT (imm
);
11227 /* Check of immediate shift constants are within range. */
11229 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
11231 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
11233 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
11235 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
11238 /* Return true if X is a uniform vector where all elements
11239 are either the floating-point constant 0.0 or the
11240 integer constant 0. */
11242 aarch64_simd_imm_zero_p (rtx x
, machine_mode mode
)
11244 return x
== CONST0_RTX (mode
);
11248 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11249 operation of width WIDTH at bit position POS. */
11252 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
11254 gcc_assert (CONST_INT_P (width
));
11255 gcc_assert (CONST_INT_P (pos
));
11257 unsigned HOST_WIDE_INT mask
11258 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
11259 return GEN_INT (mask
<< UINTVAL (pos
));
11263 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
11265 HOST_WIDE_INT imm
= INTVAL (x
);
11268 for (i
= 0; i
< 8; i
++)
11270 unsigned int byte
= imm
& 0xff;
11271 if (byte
!= 0xff && byte
!= 0)
11280 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
11282 if (GET_CODE (x
) == HIGH
11283 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
11286 if (CONST_INT_P (x
))
11289 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
11292 return aarch64_classify_symbolic_expression (x
)
11293 == SYMBOL_TINY_ABSOLUTE
;
11296 /* Return a const_int vector of VAL. */
11298 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
11300 int nunits
= GET_MODE_NUNITS (mode
);
11301 rtvec v
= rtvec_alloc (nunits
);
11304 rtx cache
= GEN_INT (val
);
11306 for (i
=0; i
< nunits
; i
++)
11307 RTVEC_ELT (v
, i
) = cache
;
11309 return gen_rtx_CONST_VECTOR (mode
, v
);
11312 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11315 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, machine_mode mode
)
11317 machine_mode vmode
;
11319 gcc_assert (!VECTOR_MODE_P (mode
));
11320 vmode
= aarch64_preferred_simd_mode (mode
);
11321 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
11322 return aarch64_simd_valid_immediate (op_v
, vmode
, false, NULL
);
11325 /* Construct and return a PARALLEL RTX vector with elements numbering the
11326 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11327 the vector - from the perspective of the architecture. This does not
11328 line up with GCC's perspective on lane numbers, so we end up with
11329 different masks depending on our target endian-ness. The diagram
11330 below may help. We must draw the distinction when building masks
11331 which select one half of the vector. An instruction selecting
11332 architectural low-lanes for a big-endian target, must be described using
11333 a mask selecting GCC high-lanes.
11335 Big-Endian Little-Endian
11337 GCC 0 1 2 3 3 2 1 0
11338 | x | x | x | x | | x | x | x | x |
11339 Architecture 3 2 1 0 3 2 1 0
11341 Low Mask: { 2, 3 } { 0, 1 }
11342 High Mask: { 0, 1 } { 2, 3 }
11346 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
11348 int nunits
= GET_MODE_NUNITS (mode
);
11349 rtvec v
= rtvec_alloc (nunits
/ 2);
11350 int high_base
= nunits
/ 2;
11356 if (BYTES_BIG_ENDIAN
)
11357 base
= high
? low_base
: high_base
;
11359 base
= high
? high_base
: low_base
;
11361 for (i
= 0; i
< nunits
/ 2; i
++)
11362 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
11364 t1
= gen_rtx_PARALLEL (mode
, v
);
11368 /* Check OP for validity as a PARALLEL RTX vector with elements
11369 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11370 from the perspective of the architecture. See the diagram above
11371 aarch64_simd_vect_par_cnst_half for more details. */
11374 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
11377 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
11378 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
11379 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
11382 if (!VECTOR_MODE_P (mode
))
11385 if (count_op
!= count_ideal
)
11388 for (i
= 0; i
< count_ideal
; i
++)
11390 rtx elt_op
= XVECEXP (op
, 0, i
);
11391 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
11393 if (!CONST_INT_P (elt_op
)
11394 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
11400 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11401 HIGH (exclusive). */
11403 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
11406 HOST_WIDE_INT lane
;
11407 gcc_assert (CONST_INT_P (operand
));
11408 lane
= INTVAL (operand
);
11410 if (lane
< low
|| lane
>= high
)
11413 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
11415 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
11419 /* Return TRUE if OP is a valid vector addressing mode. */
11421 aarch64_simd_mem_operand_p (rtx op
)
11423 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
11424 || REG_P (XEXP (op
, 0)));
11427 /* Emit a register copy from operand to operand, taking care not to
11428 early-clobber source registers in the process.
11430 COUNT is the number of components into which the copy needs to be
11433 aarch64_simd_emit_reg_reg_move (rtx
*operands
, enum machine_mode mode
,
11434 unsigned int count
)
11437 int rdest
= REGNO (operands
[0]);
11438 int rsrc
= REGNO (operands
[1]);
11440 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
11442 for (i
= 0; i
< count
; i
++)
11443 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
11444 gen_rtx_REG (mode
, rsrc
+ i
));
11446 for (i
= 0; i
< count
; i
++)
11447 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
11448 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
11451 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11452 one of VSTRUCT modes: OI, CI, or XI. */
11454 aarch64_simd_attr_length_rglist (enum machine_mode mode
)
11456 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
11459 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11460 alignment of a vector to 128 bits. */
11461 static HOST_WIDE_INT
11462 aarch64_simd_vector_alignment (const_tree type
)
11464 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
11465 return MIN (align
, 128);
11468 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11470 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
11475 /* We guarantee alignment for vectors up to 128-bits. */
11476 if (tree_int_cst_compare (TYPE_SIZE (type
),
11477 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
11480 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11484 /* Return true if the vector misalignment factor is supported by the
11487 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
11488 const_tree type
, int misalignment
,
11491 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
11493 /* Return if movmisalign pattern is not supported for this mode. */
11494 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
11497 if (misalignment
== -1)
11499 /* Misalignment factor is unknown at compile time but we know
11500 it's word aligned. */
11501 if (aarch64_simd_vector_alignment_reachable (type
, is_packed
))
11503 int element_size
= TREE_INT_CST_LOW (TYPE_SIZE (type
));
11505 if (element_size
!= 64)
11511 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
11515 /* If VALS is a vector constant that can be loaded into a register
11516 using DUP, generate instructions to do so and return an RTX to
11517 assign to the register. Otherwise return NULL_RTX. */
11519 aarch64_simd_dup_constant (rtx vals
)
11521 machine_mode mode
= GET_MODE (vals
);
11522 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11525 if (!const_vec_duplicate_p (vals
, &x
))
11528 /* We can load this constant by using DUP and a constant in a
11529 single ARM register. This will be cheaper than a vector
11531 x
= copy_to_mode_reg (inner_mode
, x
);
11532 return gen_rtx_VEC_DUPLICATE (mode
, x
);
11536 /* Generate code to load VALS, which is a PARALLEL containing only
11537 constants (for vec_init) or CONST_VECTOR, efficiently into a
11538 register. Returns an RTX to copy into the register, or NULL_RTX
11539 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11541 aarch64_simd_make_constant (rtx vals
)
11543 machine_mode mode
= GET_MODE (vals
);
11545 rtx const_vec
= NULL_RTX
;
11546 int n_elts
= GET_MODE_NUNITS (mode
);
11550 if (GET_CODE (vals
) == CONST_VECTOR
)
11552 else if (GET_CODE (vals
) == PARALLEL
)
11554 /* A CONST_VECTOR must contain only CONST_INTs and
11555 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11556 Only store valid constants in a CONST_VECTOR. */
11557 for (i
= 0; i
< n_elts
; ++i
)
11559 rtx x
= XVECEXP (vals
, 0, i
);
11560 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11563 if (n_const
== n_elts
)
11564 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
11567 gcc_unreachable ();
11569 if (const_vec
!= NULL_RTX
11570 && aarch64_simd_valid_immediate (const_vec
, mode
, false, NULL
))
11571 /* Load using MOVI/MVNI. */
11573 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
11574 /* Loaded using DUP. */
11576 else if (const_vec
!= NULL_RTX
)
11577 /* Load from constant pool. We can not take advantage of single-cycle
11578 LD1 because we need a PC-relative addressing mode. */
11581 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11582 We can not construct an initializer. */
11586 /* Expand a vector initialisation sequence, such that TARGET is
11587 initialised to contain VALS. */
11590 aarch64_expand_vector_init (rtx target
, rtx vals
)
11592 machine_mode mode
= GET_MODE (target
);
11593 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11594 /* The number of vector elements. */
11595 int n_elts
= GET_MODE_NUNITS (mode
);
11596 /* The number of vector elements which are not constant. */
11598 rtx any_const
= NULL_RTX
;
11599 /* The first element of vals. */
11600 rtx v0
= XVECEXP (vals
, 0, 0);
11601 bool all_same
= true;
11603 /* Count the number of variable elements to initialise. */
11604 for (int i
= 0; i
< n_elts
; ++i
)
11606 rtx x
= XVECEXP (vals
, 0, i
);
11607 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
11612 all_same
&= rtx_equal_p (x
, v0
);
11615 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11616 how best to handle this. */
11619 rtx constant
= aarch64_simd_make_constant (vals
);
11620 if (constant
!= NULL_RTX
)
11622 emit_move_insn (target
, constant
);
11627 /* Splat a single non-constant element if we can. */
11630 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
11631 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
11635 /* Initialise a vector which is part-variable. We want to first try
11636 to build those lanes which are constant in the most efficient way we
11638 if (n_var
!= n_elts
)
11640 rtx copy
= copy_rtx (vals
);
11642 /* Load constant part of vector. We really don't care what goes into the
11643 parts we will overwrite, but we're more likely to be able to load the
11644 constant efficiently if it has fewer, larger, repeating parts
11645 (see aarch64_simd_valid_immediate). */
11646 for (int i
= 0; i
< n_elts
; i
++)
11648 rtx x
= XVECEXP (vals
, 0, i
);
11649 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11651 rtx subst
= any_const
;
11652 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
11654 /* Look in the copied vector, as more elements are const. */
11655 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
11656 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
11662 XVECEXP (copy
, 0, i
) = subst
;
11664 aarch64_expand_vector_init (target
, copy
);
11667 /* Insert the variable lanes directly. */
11669 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
11670 gcc_assert (icode
!= CODE_FOR_nothing
);
11672 for (int i
= 0; i
< n_elts
; i
++)
11674 rtx x
= XVECEXP (vals
, 0, i
);
11675 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11677 x
= copy_to_mode_reg (inner_mode
, x
);
11678 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
11682 static unsigned HOST_WIDE_INT
11683 aarch64_shift_truncation_mask (machine_mode mode
)
11686 (!SHIFT_COUNT_TRUNCATED
11687 || aarch64_vector_mode_supported_p (mode
)
11688 || aarch64_vect_struct_mode_p (mode
)) ? 0 : (GET_MODE_BITSIZE (mode
) - 1);
11691 /* Select a format to encode pointers in exception handling data. */
11693 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
11696 switch (aarch64_cmodel
)
11698 case AARCH64_CMODEL_TINY
:
11699 case AARCH64_CMODEL_TINY_PIC
:
11700 case AARCH64_CMODEL_SMALL
:
11701 case AARCH64_CMODEL_SMALL_PIC
:
11702 case AARCH64_CMODEL_SMALL_SPIC
:
11703 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11705 type
= DW_EH_PE_sdata4
;
11708 /* No assumptions here. 8-byte relocs required. */
11709 type
= DW_EH_PE_sdata8
;
11712 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
11715 /* The last .arch and .tune assembly strings that we printed. */
11716 static std::string aarch64_last_printed_arch_string
;
11717 static std::string aarch64_last_printed_tune_string
;
11719 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11720 by the function fndecl. */
11723 aarch64_declare_function_name (FILE *stream
, const char* name
,
11726 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11728 struct cl_target_option
*targ_options
;
11730 targ_options
= TREE_TARGET_OPTION (target_parts
);
11732 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
11733 gcc_assert (targ_options
);
11735 const struct processor
*this_arch
11736 = aarch64_get_arch (targ_options
->x_explicit_arch
);
11738 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
11739 std::string extension
11740 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
11742 /* Only update the assembler .arch string if it is distinct from the last
11743 such string we printed. */
11744 std::string to_print
= this_arch
->name
+ extension
;
11745 if (to_print
!= aarch64_last_printed_arch_string
)
11747 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
11748 aarch64_last_printed_arch_string
= to_print
;
11751 /* Print the cpu name we're tuning for in the comments, might be
11752 useful to readers of the generated asm. Do it only when it changes
11753 from function to function and verbose assembly is requested. */
11754 const struct processor
*this_tune
11755 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
11757 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
11759 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
11761 aarch64_last_printed_tune_string
= this_tune
->name
;
11764 /* Don't forget the type directive for ELF. */
11765 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
11766 ASM_OUTPUT_LABEL (stream
, name
);
11769 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11772 aarch64_start_file (void)
11774 struct cl_target_option
*default_options
11775 = TREE_TARGET_OPTION (target_option_default_node
);
11777 const struct processor
*default_arch
11778 = aarch64_get_arch (default_options
->x_explicit_arch
);
11779 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
11780 std::string extension
11781 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
11782 default_arch
->flags
);
11784 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
11785 aarch64_last_printed_tune_string
= "";
11786 asm_fprintf (asm_out_file
, "\t.arch %s\n",
11787 aarch64_last_printed_arch_string
.c_str ());
11789 default_file_start ();
11792 /* Emit load exclusive. */
11795 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
11796 rtx mem
, rtx model_rtx
)
11798 rtx (*gen
) (rtx
, rtx
, rtx
);
11802 case QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
11803 case HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
11804 case SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
11805 case DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
11807 gcc_unreachable ();
11810 emit_insn (gen (rval
, mem
, model_rtx
));
11813 /* Emit store exclusive. */
11816 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
11817 rtx rval
, rtx mem
, rtx model_rtx
)
11819 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
11823 case QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
11824 case HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
11825 case SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
11826 case DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
11828 gcc_unreachable ();
11831 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
11834 /* Mark the previous jump instruction as unlikely. */
11837 aarch64_emit_unlikely_jump (rtx insn
)
11839 int very_unlikely
= REG_BR_PROB_BASE
/ 100 - 1;
11841 rtx_insn
*jump
= emit_jump_insn (insn
);
11842 add_int_reg_note (jump
, REG_BR_PROB
, very_unlikely
);
11845 /* Expand a compare and swap pattern. */
11848 aarch64_expand_compare_and_swap (rtx operands
[])
11850 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
11851 machine_mode mode
, cmp_mode
;
11852 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
11855 const gen_cas_fn split_cas
[] =
11857 gen_aarch64_compare_and_swapqi
,
11858 gen_aarch64_compare_and_swaphi
,
11859 gen_aarch64_compare_and_swapsi
,
11860 gen_aarch64_compare_and_swapdi
11862 const gen_cas_fn atomic_cas
[] =
11864 gen_aarch64_compare_and_swapqi_lse
,
11865 gen_aarch64_compare_and_swaphi_lse
,
11866 gen_aarch64_compare_and_swapsi_lse
,
11867 gen_aarch64_compare_and_swapdi_lse
11870 bval
= operands
[0];
11871 rval
= operands
[1];
11873 oldval
= operands
[3];
11874 newval
= operands
[4];
11875 is_weak
= operands
[5];
11876 mod_s
= operands
[6];
11877 mod_f
= operands
[7];
11878 mode
= GET_MODE (mem
);
11881 /* Normally the succ memory model must be stronger than fail, but in the
11882 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11883 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11885 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
11886 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
11887 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
11893 /* For short modes, we're going to perform the comparison in SImode,
11894 so do the zero-extension now. */
11896 rval
= gen_reg_rtx (SImode
);
11897 oldval
= convert_modes (SImode
, mode
, oldval
, true);
11898 /* Fall through. */
11902 /* Force the value into a register if needed. */
11903 if (!aarch64_plus_operand (oldval
, mode
))
11904 oldval
= force_reg (cmp_mode
, oldval
);
11908 gcc_unreachable ();
11913 case QImode
: idx
= 0; break;
11914 case HImode
: idx
= 1; break;
11915 case SImode
: idx
= 2; break;
11916 case DImode
: idx
= 3; break;
11918 gcc_unreachable ();
11921 gen
= atomic_cas
[idx
];
11923 gen
= split_cas
[idx
];
11925 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
11927 if (mode
== QImode
|| mode
== HImode
)
11928 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
11930 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
11931 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
11932 emit_insn (gen_rtx_SET (bval
, x
));
11935 /* Test whether the target supports using a atomic load-operate instruction.
11936 CODE is the operation and AFTER is TRUE if the data in memory after the
11937 operation should be returned and FALSE if the data before the operation
11938 should be returned. Returns FALSE if the operation isn't supported by the
11942 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
11961 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11962 sequence implementing an atomic operation. */
11965 aarch64_emit_post_barrier (enum memmodel model
)
11967 const enum memmodel base_model
= memmodel_base (model
);
11969 if (is_mm_sync (model
)
11970 && (base_model
== MEMMODEL_ACQUIRE
11971 || base_model
== MEMMODEL_ACQ_REL
11972 || base_model
== MEMMODEL_SEQ_CST
))
11974 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
11978 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11979 for the data in memory. EXPECTED is the value expected to be in memory.
11980 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11981 is the memory ordering to use. */
11984 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
11985 rtx expected
, rtx desired
,
11988 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
11991 mode
= GET_MODE (mem
);
11995 case QImode
: gen
= gen_aarch64_atomic_casqi
; break;
11996 case HImode
: gen
= gen_aarch64_atomic_cashi
; break;
11997 case SImode
: gen
= gen_aarch64_atomic_cassi
; break;
11998 case DImode
: gen
= gen_aarch64_atomic_casdi
; break;
12000 gcc_unreachable ();
12003 /* Move the expected value into the CAS destination register. */
12004 emit_insn (gen_rtx_SET (rval
, expected
));
12006 /* Emit the CAS. */
12007 emit_insn (gen (rval
, mem
, desired
, model
));
12009 /* Compare the expected value with the value loaded by the CAS, to establish
12010 whether the swap was made. */
12011 aarch64_gen_compare_reg (EQ
, rval
, expected
);
12014 /* Split a compare and swap pattern. */
12017 aarch64_split_compare_and_swap (rtx operands
[])
12019 rtx rval
, mem
, oldval
, newval
, scratch
;
12022 rtx_code_label
*label1
, *label2
;
12024 enum memmodel model
;
12027 rval
= operands
[0];
12029 oldval
= operands
[2];
12030 newval
= operands
[3];
12031 is_weak
= (operands
[4] != const0_rtx
);
12032 model_rtx
= operands
[5];
12033 scratch
= operands
[7];
12034 mode
= GET_MODE (mem
);
12035 model
= memmodel_from_int (INTVAL (model_rtx
));
12040 label1
= gen_label_rtx ();
12041 emit_label (label1
);
12043 label2
= gen_label_rtx ();
12045 /* The initial load can be relaxed for a __sync operation since a final
12046 barrier will be emitted to stop code hoisting. */
12047 if (is_mm_sync (model
))
12048 aarch64_emit_load_exclusive (mode
, rval
, mem
,
12049 GEN_INT (MEMMODEL_RELAXED
));
12051 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
12053 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
12054 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12055 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12056 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12057 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12059 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
12063 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
12064 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12065 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
12066 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12070 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12071 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
12072 emit_insn (gen_rtx_SET (cond
, x
));
12075 emit_label (label2
);
12077 /* Emit any final barrier needed for a __sync operation. */
12078 if (is_mm_sync (model
))
12079 aarch64_emit_post_barrier (model
);
12082 /* Emit a BIC instruction. */
12085 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
12087 rtx shift_rtx
= GEN_INT (shift
);
12088 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12092 case SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
12093 case DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
12095 gcc_unreachable ();
12098 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
12101 /* Emit an atomic swap. */
12104 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
12105 rtx mem
, rtx model
)
12107 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12111 case QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
12112 case HImode
: gen
= gen_aarch64_atomic_swphi
; break;
12113 case SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
12114 case DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
12116 gcc_unreachable ();
12119 emit_insn (gen (dst
, mem
, value
, model
));
12122 /* Operations supported by aarch64_emit_atomic_load_op. */
12124 enum aarch64_atomic_load_op_code
12126 AARCH64_LDOP_PLUS
, /* A + B */
12127 AARCH64_LDOP_XOR
, /* A ^ B */
12128 AARCH64_LDOP_OR
, /* A | B */
12129 AARCH64_LDOP_BIC
/* A & ~B */
12132 /* Emit an atomic load-operate. */
12135 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
12136 machine_mode mode
, rtx dst
, rtx src
,
12137 rtx mem
, rtx model
)
12139 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
12140 const aarch64_atomic_load_op_fn plus
[] =
12142 gen_aarch64_atomic_loadaddqi
,
12143 gen_aarch64_atomic_loadaddhi
,
12144 gen_aarch64_atomic_loadaddsi
,
12145 gen_aarch64_atomic_loadadddi
12147 const aarch64_atomic_load_op_fn eor
[] =
12149 gen_aarch64_atomic_loadeorqi
,
12150 gen_aarch64_atomic_loadeorhi
,
12151 gen_aarch64_atomic_loadeorsi
,
12152 gen_aarch64_atomic_loadeordi
12154 const aarch64_atomic_load_op_fn ior
[] =
12156 gen_aarch64_atomic_loadsetqi
,
12157 gen_aarch64_atomic_loadsethi
,
12158 gen_aarch64_atomic_loadsetsi
,
12159 gen_aarch64_atomic_loadsetdi
12161 const aarch64_atomic_load_op_fn bic
[] =
12163 gen_aarch64_atomic_loadclrqi
,
12164 gen_aarch64_atomic_loadclrhi
,
12165 gen_aarch64_atomic_loadclrsi
,
12166 gen_aarch64_atomic_loadclrdi
12168 aarch64_atomic_load_op_fn gen
;
12173 case QImode
: idx
= 0; break;
12174 case HImode
: idx
= 1; break;
12175 case SImode
: idx
= 2; break;
12176 case DImode
: idx
= 3; break;
12178 gcc_unreachable ();
12183 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
12184 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
12185 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
12186 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
12188 gcc_unreachable ();
12191 emit_insn (gen (dst
, mem
, src
, model
));
12194 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12195 location to store the data read from memory. OUT_RESULT is the location to
12196 store the result of the operation. MEM is the memory location to read and
12197 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12198 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12202 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
12203 rtx mem
, rtx value
, rtx model_rtx
)
12205 machine_mode mode
= GET_MODE (mem
);
12206 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12207 const bool short_mode
= (mode
< SImode
);
12208 aarch64_atomic_load_op_code ldop_code
;
12213 out_data
= gen_lowpart (mode
, out_data
);
12216 out_result
= gen_lowpart (mode
, out_result
);
12218 /* Make sure the value is in a register, putting it into a destination
12219 register if it needs to be manipulated. */
12220 if (!register_operand (value
, mode
)
12221 || code
== AND
|| code
== MINUS
)
12223 src
= out_result
? out_result
: out_data
;
12224 emit_move_insn (src
, gen_lowpart (mode
, value
));
12228 gcc_assert (register_operand (src
, mode
));
12230 /* Preprocess the data for the operation as necessary. If the operation is
12231 a SET then emit a swap instruction and finish. */
12235 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
12239 /* Negate the value and treat it as a PLUS. */
12243 /* Resize the value if necessary. */
12245 src
= gen_lowpart (wmode
, src
);
12247 neg_src
= gen_rtx_NEG (wmode
, src
);
12248 emit_insn (gen_rtx_SET (src
, neg_src
));
12251 src
= gen_lowpart (mode
, src
);
12253 /* Fall-through. */
12255 ldop_code
= AARCH64_LDOP_PLUS
;
12259 ldop_code
= AARCH64_LDOP_OR
;
12263 ldop_code
= AARCH64_LDOP_XOR
;
12270 /* Resize the value if necessary. */
12272 src
= gen_lowpart (wmode
, src
);
12274 not_src
= gen_rtx_NOT (wmode
, src
);
12275 emit_insn (gen_rtx_SET (src
, not_src
));
12278 src
= gen_lowpart (mode
, src
);
12280 ldop_code
= AARCH64_LDOP_BIC
;
12284 /* The operation can't be done with atomic instructions. */
12285 gcc_unreachable ();
12288 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
12290 /* If necessary, calculate the data in memory after the update by redoing the
12291 operation from values in registers. */
12297 src
= gen_lowpart (wmode
, src
);
12298 out_data
= gen_lowpart (wmode
, out_data
);
12299 out_result
= gen_lowpart (wmode
, out_result
);
12308 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
12311 x
= gen_rtx_IOR (wmode
, out_data
, src
);
12314 x
= gen_rtx_XOR (wmode
, out_data
, src
);
12317 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
12320 gcc_unreachable ();
12323 emit_set_insn (out_result
, x
);
12328 /* Split an atomic operation. */
12331 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
12332 rtx value
, rtx model_rtx
, rtx cond
)
12334 machine_mode mode
= GET_MODE (mem
);
12335 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12336 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
12337 const bool is_sync
= is_mm_sync (model
);
12338 rtx_code_label
*label
;
12341 /* Split the atomic operation into a sequence. */
12342 label
= gen_label_rtx ();
12343 emit_label (label
);
12346 new_out
= gen_lowpart (wmode
, new_out
);
12348 old_out
= gen_lowpart (wmode
, old_out
);
12351 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
12353 /* The initial load can be relaxed for a __sync operation since a final
12354 barrier will be emitted to stop code hoisting. */
12356 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
12357 GEN_INT (MEMMODEL_RELAXED
));
12359 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
12368 x
= gen_rtx_AND (wmode
, old_out
, value
);
12369 emit_insn (gen_rtx_SET (new_out
, x
));
12370 x
= gen_rtx_NOT (wmode
, new_out
);
12371 emit_insn (gen_rtx_SET (new_out
, x
));
12375 if (CONST_INT_P (value
))
12377 value
= GEN_INT (-INTVAL (value
));
12380 /* Fall through. */
12383 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
12384 emit_insn (gen_rtx_SET (new_out
, x
));
12388 aarch64_emit_store_exclusive (mode
, cond
, mem
,
12389 gen_lowpart (mode
, new_out
), model_rtx
);
12391 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12392 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12393 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
12394 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12396 /* Emit any final barrier needed for a __sync operation. */
12398 aarch64_emit_post_barrier (model
);
12402 aarch64_init_libfuncs (void)
12404 /* Half-precision float operations. The compiler handles all operations
12405 with NULL libfuncs by converting to SFmode. */
12408 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
12409 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
12412 set_optab_libfunc (add_optab
, HFmode
, NULL
);
12413 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
12414 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
12415 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
12416 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
12419 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
12420 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
12421 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
12422 set_optab_libfunc (le_optab
, HFmode
, NULL
);
12423 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
12424 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
12425 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
12428 /* Target hook for c_mode_for_suffix. */
12429 static machine_mode
12430 aarch64_c_mode_for_suffix (char suffix
)
12438 /* We can only represent floating point constants which will fit in
12439 "quarter-precision" values. These values are characterised by
12440 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12443 (-1)^s * (n/16) * 2^r
12446 's' is the sign bit.
12447 'n' is an integer in the range 16 <= n <= 31.
12448 'r' is an integer in the range -3 <= r <= 4. */
12450 /* Return true iff X can be represented by a quarter-precision
12451 floating point immediate operand X. Note, we cannot represent 0.0. */
12453 aarch64_float_const_representable_p (rtx x
)
12455 /* This represents our current view of how many bits
12456 make up the mantissa. */
12457 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
12459 unsigned HOST_WIDE_INT mantissa
, mask
;
12460 REAL_VALUE_TYPE r
, m
;
12463 if (!CONST_DOUBLE_P (x
))
12466 /* We don't support HFmode constants yet. */
12467 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
12470 r
= *CONST_DOUBLE_REAL_VALUE (x
);
12472 /* We cannot represent infinities, NaNs or +/-zero. We won't
12473 know if we have +zero until we analyse the mantissa, but we
12474 can reject the other invalid values. */
12475 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
12476 || REAL_VALUE_MINUS_ZERO (r
))
12479 /* Extract exponent. */
12480 r
= real_value_abs (&r
);
12481 exponent
= REAL_EXP (&r
);
12483 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12484 highest (sign) bit, with a fixed binary point at bit point_pos.
12485 m1 holds the low part of the mantissa, m2 the high part.
12486 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12487 bits for the mantissa, this can fail (low bits will be lost). */
12488 real_ldexp (&m
, &r
, point_pos
- exponent
);
12489 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
12491 /* If the low part of the mantissa has bits set we cannot represent
12493 if (w
.elt (0) != 0)
12495 /* We have rejected the lower HOST_WIDE_INT, so update our
12496 understanding of how many bits lie in the mantissa and
12497 look only at the high HOST_WIDE_INT. */
12498 mantissa
= w
.elt (1);
12499 point_pos
-= HOST_BITS_PER_WIDE_INT
;
12501 /* We can only represent values with a mantissa of the form 1.xxxx. */
12502 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
12503 if ((mantissa
& mask
) != 0)
12506 /* Having filtered unrepresentable values, we may now remove all
12507 but the highest 5 bits. */
12508 mantissa
>>= point_pos
- 5;
12510 /* We cannot represent the value 0.0, so reject it. This is handled
12515 /* Then, as bit 4 is always set, we can mask it off, leaving
12516 the mantissa in the range [0, 15]. */
12517 mantissa
&= ~(1 << 4);
12518 gcc_assert (mantissa
<= 15);
12520 /* GCC internally does not use IEEE754-like encoding (where normalized
12521 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12522 Our mantissa values are shifted 4 places to the left relative to
12523 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12524 by 5 places to correct for GCC's representation. */
12525 exponent
= 5 - exponent
;
12527 return (exponent
>= 0 && exponent
<= 7);
12531 aarch64_output_simd_mov_immediate (rtx const_vector
,
12536 static char templ
[40];
12537 const char *mnemonic
;
12538 const char *shift_op
;
12539 unsigned int lane_count
= 0;
12542 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
12544 /* This will return true to show const_vector is legal for use as either
12545 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12546 also update INFO to show how the immediate should be generated. */
12547 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
12548 gcc_assert (is_valid
);
12550 element_char
= sizetochar (info
.element_width
);
12551 lane_count
= width
/ info
.element_width
;
12553 mode
= GET_MODE_INNER (mode
);
12554 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12556 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
12557 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12558 move immediate path. */
12559 if (aarch64_float_const_zero_rtx_p (info
.value
))
12560 info
.value
= GEN_INT (0);
12563 const unsigned int buf_size
= 20;
12564 char float_buf
[buf_size
] = {'\0'};
12565 real_to_decimal_for_mode (float_buf
,
12566 CONST_DOUBLE_REAL_VALUE (info
.value
),
12567 buf_size
, buf_size
, 1, mode
);
12569 if (lane_count
== 1)
12570 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
12572 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
12573 lane_count
, element_char
, float_buf
);
12578 mnemonic
= info
.mvn
? "mvni" : "movi";
12579 shift_op
= info
.msl
? "msl" : "lsl";
12581 gcc_assert (CONST_INT_P (info
.value
));
12582 if (lane_count
== 1)
12583 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
12584 mnemonic
, UINTVAL (info
.value
));
12585 else if (info
.shift
)
12586 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12587 ", %s %d", mnemonic
, lane_count
, element_char
,
12588 UINTVAL (info
.value
), shift_op
, info
.shift
);
12590 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
12591 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
12596 aarch64_output_scalar_simd_mov_immediate (rtx immediate
,
12599 machine_mode vmode
;
12601 gcc_assert (!VECTOR_MODE_P (mode
));
12602 vmode
= aarch64_simd_container_mode (mode
, 64);
12603 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
12604 return aarch64_output_simd_mov_immediate (v_op
, vmode
, 64);
12607 /* Split operands into moves from op[1] + op[2] into op[0]. */
12610 aarch64_split_combinev16qi (rtx operands
[3])
12612 unsigned int dest
= REGNO (operands
[0]);
12613 unsigned int src1
= REGNO (operands
[1]);
12614 unsigned int src2
= REGNO (operands
[2]);
12615 machine_mode halfmode
= GET_MODE (operands
[1]);
12616 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
12617 rtx destlo
, desthi
;
12619 gcc_assert (halfmode
== V16QImode
);
12621 if (src1
== dest
&& src2
== dest
+ halfregs
)
12623 /* No-op move. Can't split to nothing; emit something. */
12624 emit_note (NOTE_INSN_DELETED
);
12628 /* Preserve register attributes for variable tracking. */
12629 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
12630 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
12631 GET_MODE_SIZE (halfmode
));
12633 /* Special case of reversed high/low parts. */
12634 if (reg_overlap_mentioned_p (operands
[2], destlo
)
12635 && reg_overlap_mentioned_p (operands
[1], desthi
))
12637 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12638 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
12639 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
12641 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
12643 /* Try to avoid unnecessary moves if part of the result
12644 is in the right place already. */
12646 emit_move_insn (destlo
, operands
[1]);
12647 if (src2
!= dest
+ halfregs
)
12648 emit_move_insn (desthi
, operands
[2]);
12652 if (src2
!= dest
+ halfregs
)
12653 emit_move_insn (desthi
, operands
[2]);
12655 emit_move_insn (destlo
, operands
[1]);
12659 /* vec_perm support. */
12661 #define MAX_VECT_LEN 16
12663 struct expand_vec_perm_d
12665 rtx target
, op0
, op1
;
12666 unsigned char perm
[MAX_VECT_LEN
];
12667 machine_mode vmode
;
12668 unsigned char nelt
;
12673 /* Generate a variable permutation. */
12676 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
12678 machine_mode vmode
= GET_MODE (target
);
12679 bool one_vector_p
= rtx_equal_p (op0
, op1
);
12681 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
12682 gcc_checking_assert (GET_MODE (op0
) == vmode
);
12683 gcc_checking_assert (GET_MODE (op1
) == vmode
);
12684 gcc_checking_assert (GET_MODE (sel
) == vmode
);
12685 gcc_checking_assert (TARGET_SIMD
);
12689 if (vmode
== V8QImode
)
12691 /* Expand the argument to a V16QI mode by duplicating it. */
12692 rtx pair
= gen_reg_rtx (V16QImode
);
12693 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
12694 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
12698 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
12705 if (vmode
== V8QImode
)
12707 pair
= gen_reg_rtx (V16QImode
);
12708 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
12709 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
12713 pair
= gen_reg_rtx (OImode
);
12714 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
12715 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
12721 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
12723 machine_mode vmode
= GET_MODE (target
);
12724 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
12725 bool one_vector_p
= rtx_equal_p (op0
, op1
);
12728 /* The TBL instruction does not use a modulo index, so we must take care
12729 of that ourselves. */
12730 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
12731 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12732 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
12734 /* For big-endian, we also need to reverse the index within the vector
12735 (but not which vector). */
12736 if (BYTES_BIG_ENDIAN
)
12738 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12740 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
12741 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
12742 NULL
, 0, OPTAB_LIB_WIDEN
);
12744 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
12747 /* Recognize patterns suitable for the TRN instructions. */
12749 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
12751 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
12752 rtx out
, in0
, in1
, x
;
12753 rtx (*gen
) (rtx
, rtx
, rtx
);
12754 machine_mode vmode
= d
->vmode
;
12756 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
12759 /* Note that these are little-endian tests.
12760 We correct for big-endian later. */
12761 if (d
->perm
[0] == 0)
12763 else if (d
->perm
[0] == 1)
12767 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12769 for (i
= 0; i
< nelt
; i
+= 2)
12771 if (d
->perm
[i
] != i
+ odd
)
12773 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
12783 if (BYTES_BIG_ENDIAN
)
12785 x
= in0
, in0
= in1
, in1
= x
;
12794 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
12795 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
12796 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
12797 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
12798 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
12799 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
12800 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
12801 case V4HFmode
: gen
= gen_aarch64_trn2v4hf
; break;
12802 case V8HFmode
: gen
= gen_aarch64_trn2v8hf
; break;
12803 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
12804 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
12805 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
12814 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
12815 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
12816 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
12817 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
12818 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
12819 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
12820 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
12821 case V4HFmode
: gen
= gen_aarch64_trn1v4hf
; break;
12822 case V8HFmode
: gen
= gen_aarch64_trn1v8hf
; break;
12823 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
12824 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
12825 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
12831 emit_insn (gen (out
, in0
, in1
));
12835 /* Recognize patterns suitable for the UZP instructions. */
12837 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
12839 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
12840 rtx out
, in0
, in1
, x
;
12841 rtx (*gen
) (rtx
, rtx
, rtx
);
12842 machine_mode vmode
= d
->vmode
;
12844 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
12847 /* Note that these are little-endian tests.
12848 We correct for big-endian later. */
12849 if (d
->perm
[0] == 0)
12851 else if (d
->perm
[0] == 1)
12855 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12857 for (i
= 0; i
< nelt
; i
++)
12859 unsigned elt
= (i
* 2 + odd
) & mask
;
12860 if (d
->perm
[i
] != elt
)
12870 if (BYTES_BIG_ENDIAN
)
12872 x
= in0
, in0
= in1
, in1
= x
;
12881 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
12882 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
12883 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
12884 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
12885 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
12886 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
12887 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
12888 case V4HFmode
: gen
= gen_aarch64_uzp2v4hf
; break;
12889 case V8HFmode
: gen
= gen_aarch64_uzp2v8hf
; break;
12890 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
12891 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
12892 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
12901 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
12902 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
12903 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
12904 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
12905 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
12906 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
12907 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
12908 case V4HFmode
: gen
= gen_aarch64_uzp1v4hf
; break;
12909 case V8HFmode
: gen
= gen_aarch64_uzp1v8hf
; break;
12910 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
12911 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
12912 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
12918 emit_insn (gen (out
, in0
, in1
));
12922 /* Recognize patterns suitable for the ZIP instructions. */
12924 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
12926 unsigned int i
, high
, mask
, nelt
= d
->nelt
;
12927 rtx out
, in0
, in1
, x
;
12928 rtx (*gen
) (rtx
, rtx
, rtx
);
12929 machine_mode vmode
= d
->vmode
;
12931 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
12934 /* Note that these are little-endian tests.
12935 We correct for big-endian later. */
12937 if (d
->perm
[0] == high
)
12940 else if (d
->perm
[0] == 0)
12944 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
12946 for (i
= 0; i
< nelt
/ 2; i
++)
12948 unsigned elt
= (i
+ high
) & mask
;
12949 if (d
->perm
[i
* 2] != elt
)
12951 elt
= (elt
+ nelt
) & mask
;
12952 if (d
->perm
[i
* 2 + 1] != elt
)
12962 if (BYTES_BIG_ENDIAN
)
12964 x
= in0
, in0
= in1
, in1
= x
;
12973 case V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
12974 case V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
12975 case V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
12976 case V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
12977 case V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
12978 case V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
12979 case V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
12980 case V4HFmode
: gen
= gen_aarch64_zip2v4hf
; break;
12981 case V8HFmode
: gen
= gen_aarch64_zip2v8hf
; break;
12982 case V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
12983 case V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
12984 case V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
12993 case V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
12994 case V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
12995 case V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
12996 case V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
12997 case V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
12998 case V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
12999 case V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
13000 case V4HFmode
: gen
= gen_aarch64_zip1v4hf
; break;
13001 case V8HFmode
: gen
= gen_aarch64_zip1v8hf
; break;
13002 case V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
13003 case V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
13004 case V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
13010 emit_insn (gen (out
, in0
, in1
));
13014 /* Recognize patterns for the EXT insn. */
13017 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
13019 unsigned int i
, nelt
= d
->nelt
;
13020 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
13023 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
13025 /* Check if the extracted indices are increasing by one. */
13026 for (i
= 1; i
< nelt
; i
++)
13028 unsigned int required
= location
+ i
;
13029 if (d
->one_vector_p
)
13031 /* We'll pass the same vector in twice, so allow indices to wrap. */
13032 required
&= (nelt
- 1);
13034 if (d
->perm
[i
] != required
)
13040 case V16QImode
: gen
= gen_aarch64_extv16qi
; break;
13041 case V8QImode
: gen
= gen_aarch64_extv8qi
; break;
13042 case V4HImode
: gen
= gen_aarch64_extv4hi
; break;
13043 case V8HImode
: gen
= gen_aarch64_extv8hi
; break;
13044 case V2SImode
: gen
= gen_aarch64_extv2si
; break;
13045 case V4SImode
: gen
= gen_aarch64_extv4si
; break;
13046 case V4HFmode
: gen
= gen_aarch64_extv4hf
; break;
13047 case V8HFmode
: gen
= gen_aarch64_extv8hf
; break;
13048 case V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
13049 case V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
13050 case V2DImode
: gen
= gen_aarch64_extv2di
; break;
13051 case V2DFmode
: gen
= gen_aarch64_extv2df
; break;
13060 /* The case where (location == 0) is a no-op for both big- and little-endian,
13061 and is removed by the mid-end at optimization levels -O1 and higher. */
13063 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
13065 /* After setup, we want the high elements of the first vector (stored
13066 at the LSB end of the register), and the low elements of the second
13067 vector (stored at the MSB end of the register). So swap. */
13068 std::swap (d
->op0
, d
->op1
);
13069 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13070 location
= nelt
- location
;
13073 offset
= GEN_INT (location
);
13074 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
13078 /* Recognize patterns for the REV insns. */
13081 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
13083 unsigned int i
, j
, diff
, nelt
= d
->nelt
;
13084 rtx (*gen
) (rtx
, rtx
);
13086 if (!d
->one_vector_p
)
13095 case V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
13096 case V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
13104 case V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
13105 case V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
13106 case V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
13107 case V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
13115 case V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
13116 case V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
13117 case V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
13118 case V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
13119 case V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
13120 case V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
13121 case V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
13122 case V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
13123 case V8HFmode
: gen
= gen_aarch64_rev64v8hf
; break;
13124 case V4HFmode
: gen
= gen_aarch64_rev64v4hf
; break;
13133 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
13134 for (j
= 0; j
<= diff
; j
+= 1)
13136 /* This is guaranteed to be true as the value of diff
13137 is 7, 3, 1 and we should have enough elements in the
13138 queue to generate this. Getting a vector mask with a
13139 value of diff other than these values implies that
13140 something is wrong by the time we get here. */
13141 gcc_assert (i
+ j
< nelt
);
13142 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
13150 emit_insn (gen (d
->target
, d
->op0
));
13155 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
13157 rtx (*gen
) (rtx
, rtx
, rtx
);
13158 rtx out
= d
->target
;
13160 machine_mode vmode
= d
->vmode
;
13161 unsigned int i
, elt
, nelt
= d
->nelt
;
13165 for (i
= 1; i
< nelt
; i
++)
13167 if (elt
!= d
->perm
[i
])
13171 /* The generic preparation in aarch64_expand_vec_perm_const_1
13172 swaps the operand order and the permute indices if it finds
13173 d->perm[0] to be in the second operand. Thus, we can always
13174 use d->op0 and need not do any extra arithmetic to get the
13175 correct lane number. */
13177 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
13181 case V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
13182 case V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
13183 case V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
13184 case V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
13185 case V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
13186 case V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
13187 case V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
13188 case V8HFmode
: gen
= gen_aarch64_dup_lanev8hf
; break;
13189 case V4HFmode
: gen
= gen_aarch64_dup_lanev4hf
; break;
13190 case V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
13191 case V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
13192 case V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
13197 emit_insn (gen (out
, in0
, lane
));
13202 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13204 rtx rperm[MAX_VECT_LEN], sel;
13205 machine_mode vmode = d->vmode;
13206 unsigned int i, nelt = d->nelt;
13211 /* Generic code will try constant permutation twice. Once with the
13212 original mode and again with the elements lowered to QImode.
13213 So wait and don't do the selector expansion ourselves. */
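/* TBL is the catch-all here: it performs a byte-wise table lookup, so once
   the permutation has been lowered to QImode indices any shuffle the earlier
   matchers rejected can still be expanded, at the cost of materialising the
   selector vector in a register.  */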
13214 if (vmode != V8QImode && vmode != V16QImode)
13217 for (i = 0; i < nelt; ++i)
13219 int nunits = GET_MODE_NUNITS (vmode);
13221 /* If big-endian and two vectors we end up with a weird mixed-endian
13222 mode on NEON. Reverse the index within each word but not the word
13224 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13227 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13228 sel = force_reg (vmode, sel);
13230 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13235 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13237 /* The pattern matching functions above are written to look for a small
13238 number to begin the sequence (0, 1, N/2). If we begin with an index
13239 from the second operand, we can swap the operands. */
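/* For instance, with nelt == 4 the permutation {5, 1, 6, 2} starts in the
   second operand; XOR-ing every index with nelt turns it into {1, 5, 2, 6},
   which together with the std::swap of op0/op1 below selects exactly the
   same data but now begins in the first operand.  */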
13240 if (d->perm[0] >= d->nelt)
13242 unsigned i, nelt = d->nelt;
13244 gcc_assert (nelt == (nelt & -nelt));
13245 for (i = 0; i < nelt; ++i)
13246 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13248 std::swap (d->op0, d->op1);
13253 if (aarch64_evpc_rev (d))
13255 else if (aarch64_evpc_ext (d))
13257 else if (aarch64_evpc_dup (d))
13259 else if (aarch64_evpc_zip (d))
13261 else if (aarch64_evpc_uzp (d))
13263 else if (aarch64_evpc_trn (d))
13265 return aarch64_evpc_tbl (d);
13270 /* Expand a vec_perm_const pattern. */
13273 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13275 struct expand_vec_perm_d d;
13276 int i, nelt, which;
13282 d.vmode = GET_MODE (target);
13283 gcc_assert (VECTOR_MODE_P (d.vmode));
13284 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13285 d.testing_p = false;
13287 for (i = which = 0; i < nelt; ++i)
13289 rtx e = XVECEXP (sel, 0, i);
13290 int ei = INTVAL (e) & (2 * nelt - 1);
13291 which |= (ei < nelt ? 1 : 2);
13298 gcc_unreachable ();
13301 d.one_vector_p = false;
13302 if (!rtx_equal_p (op0, op1))
13305 /* The elements of PERM do not suggest that only the first operand
13306 is used, but both operands are identical. Allow easier matching
13307 of the permutation by folding the permutation into the single
13309 /* Fall Through. */
13311 for (i = 0; i < nelt; ++i)
13312 d.perm[i] &= nelt - 1;
13314 d.one_vector_p = true;
13319 d.one_vector_p = true;
13323 return aarch64_expand_vec_perm_const_1 (&d);
13327 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13328 const unsigned char *sel)
13330 struct expand_vec_perm_d d;
13331 unsigned int i, nelt, which;
13335 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13336 d.testing_p = true;
13337 memcpy (d.perm, sel, nelt);
13339 /* Calculate whether all elements are in one vector. */
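/* "which" is used as a bit mask here: bit 0 is set if any index selects from
   the first vector (index < nelt) and bit 1 if any index selects from the
   second (index >= nelt).  A value of 3 therefore means both input vectors
   are really needed.  */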
13340 for (i = which = 0; i < nelt; ++i)
13342 unsigned char e = d.perm[i];
13343 gcc_assert (e < 2 * nelt);
13344 which |= (e < nelt ? 1 : 2);
13347 /* If all elements are from the second vector, reindex as if from the
13350 for (i = 0; i < nelt; ++i)
13353 /* Check whether the mask can be applied to a single vector. */
13354 d.one_vector_p = (which != 3);
13356 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13357 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13358 if (!d.one_vector_p)
13359 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13362 ret = aarch64_expand_vec_perm_const_1 (&d);
13369 aarch64_reverse_mask (enum machine_mode mode)
13371 /* We have to reverse each vector because we don't have
13372 a permuted load that can reverse-load according to ABI rules. */
13374 rtvec v = rtvec_alloc (16);
13376 int nunits = GET_MODE_NUNITS (mode);
13377 int usize = GET_MODE_UNIT_SIZE (mode);
13379 gcc_assert (BYTES_BIG_ENDIAN);
13380 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13382 for (i = 0; i < nunits; i++)
13383 for (j = 0; j < usize; j++)
13384 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13385 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13386 return force_reg (V16QImode, mask);
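/* Worked example: for V4SImode (nunits == 4, usize == 4) the mask built above
   is {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}, i.e. a byte reversal within
   each 32-bit element.  */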
13389 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13390 However due to issues with register allocation it is preferable to avoid
13391 tying integer scalar and FP scalar modes. Executing integer operations
13392 in general registers is better than treating them as scalar vector
13393 operations. This reduces latency and avoids redundant int<->FP moves.
13394 So tie modes if they are either the same class, or vector modes with
13395 other vector modes, vector structs or any scalar mode.
13399 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13401 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13404 /* We specifically want to allow elements of "structure" modes to
13405 be tieable to the structure. This more general condition allows
13406 other rarer situations too. */
13407 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13410 /* Also allow any scalar modes with vectors. */
13411 if (aarch64_vector_mode_supported_p (mode1)
13412 || aarch64_vector_mode_supported_p (mode2))
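/* For example, DImode and DFmode have the same size but different mode
   classes and neither is a supported vector mode, so they are not tied;
   this keeps 64-bit integer work in the general registers instead of turning
   it into scalar FP/SIMD operations.  */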
13418 /* Return a new RTX holding the result of moving POINTER forward by
13422 aarch64_move_pointer (rtx pointer, int amount)
13424 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13426 return adjust_automodify_address (pointer, GET_MODE (pointer),
13430 /* Return a new RTX holding the result of moving POINTER forward by the
13431 size of the mode it points to. */
13434 aarch64_progress_pointer (rtx pointer)
13436 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13438 return aarch64_move_pointer (pointer, amount);
13441 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13445 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13448 rtx reg = gen_reg_rtx (mode);
13450 /* "Cast" the pointers to the correct mode.  */
13451 *src = adjust_address (*src, mode, 0);
13452 *dst = adjust_address (*dst, mode, 0);
13453 /* Emit the memcpy.  */
13454 emit_move_insn (reg, *src);
13455 emit_move_insn (*dst, reg);
13456 /* Move the pointers forward.  */
13457 *src = aarch64_progress_pointer (*src);
13458 *dst = aarch64_progress_pointer (*dst);
13461 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13462 we succeed, otherwise return false. */
13465 aarch64_expand_movmem (rtx *operands)
13468 rtx dst = operands[0];
13469 rtx src = operands[1];
13471 bool speed_p = !optimize_function_for_size_p (cfun);
13473 /* When optimizing for size, give a better estimate of the length of a
13474 memcpy call, but use the default otherwise. */
13475 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13477 /* We can't do anything smart if the amount to copy is not constant. */
13478 if (!CONST_INT_P (operands[2]))
13481 n = UINTVAL (operands[2]);
13483 /* Try to keep the number of instructions low. For cases below 16 bytes we
13484 need to make at most two moves. For cases above 16 bytes it will be one
13485 move for each 16 byte chunk, then at most two additional moves. */
13486 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13489 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13490 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13492 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13493 src = adjust_automodify_address (src, VOIDmode, base, 0);
13495 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13501 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13506 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13511 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13512 4-byte chunk, partially overlapping with the previously copied chunk. */
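/* E.g. a 7-byte copy is done as a 4-byte copy of bytes 0-3 followed by a
   4-byte copy of bytes 3-6: the pointers are moved back by 8 - n bytes, so
   no byte-by-byte tail is needed.  */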
13515 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13521 src = aarch64_move_pointer (src, move);
13522 dst = aarch64_move_pointer (dst, move);
13523 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13528 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13529 them, then (if applicable) an 8-byte chunk. */
13534 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13539 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13544 /* Finish the final bytes of the copy. We can always do this in one
13545 instruction. We either copy the exact amount we need, or partially
13546 overlap with the previous chunk we copied and copy 8-bytes. */
13550 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13552 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13554 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13559 src = aarch64_move_pointer (src, -1);
13560 dst = aarch64_move_pointer (dst, -1);
13561 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13567 src = aarch64_move_pointer (src, move);
13568 dst = aarch64_move_pointer (dst, move);
13569 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13576 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13577 SImode stores. Handle the case when the constant has identical
13578 bottom and top halves. This is beneficial when the two stores can be
13579 merged into an STP and we avoid synthesising potentially expensive
13580 immediates twice. Return true if such a split is possible. */
13583 aarch64_split_dimode_const_store (rtx dst, rtx src)
13585 rtx lo = gen_lowpart (SImode, src);
13586 rtx hi = gen_highpart_mode (SImode, DImode, src);
13588 bool size_p = optimize_function_for_size_p (cfun);
13590 if (!rtx_equal_p (lo, hi))
13593 unsigned int orig_cost
13594 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13595 unsigned int lo_cost
13596 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13598 /* We want to transform:
13600 MOVK x1, 0x140, lsl 16
13601 MOVK x1, 0xc0da, lsl 32
13602 MOVK x1, 0x140, lsl 48
13606 MOVK w1, 0x140, lsl 16
13608 So we want to perform this only when we save two instructions
13609 or more. When optimizing for size, however, accept any code size
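/* In the example above orig_cost would be 4 (MOV plus three MOVKs for the
   DImode immediate) and lo_cost 2 (MOV plus one MOVK for the SImode half),
   so the split already saves two instructions before sched-fusion gets a
   chance to merge the two stores into an STP.  */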
13611 if (size_p && orig_cost <= lo_cost)
13615 && (orig_cost <= lo_cost + 1))
13618 rtx mem_lo = adjust_address (dst, SImode, 0);
13619 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13622 rtx tmp_reg = gen_reg_rtx (SImode);
13623 aarch64_expand_mov_immediate (tmp_reg, lo);
13624 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13625 /* Don't emit an explicit store pair as this may not be always profitable.
13626 Let the sched-fusion logic decide whether to merge them. */
13627 emit_move_insn (mem_lo, tmp_reg);
13628 emit_move_insn (mem_hi, tmp_reg);
13633 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13635 static unsigned HOST_WIDE_INT
13636 aarch64_asan_shadow_offset (void)
13638 return (HOST_WIDE_INT_1 << 36);
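/* AddressSanitizer forms shadow addresses as (addr >> ASAN_SHADOW_SHIFT)
   plus this constant; 1 << 36 is the offset libsanitizer expects for
   AArch64, so the two must be kept in sync.  */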
13642 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13643 unsigned int align,
13644 enum by_pieces_operation op,
13647 /* STORE_BY_PIECES can be used when copying a constant string, but
13648 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13649 For now we always fail this and let the move_by_pieces code copy
13650 the string from read-only memory. */
13651 if (op == STORE_BY_PIECES)
13654 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13658 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13659 int code, tree treeop0, tree treeop1)
13661 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13663 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13665 struct expand_operand ops[4];
13668 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13670 op_mode = GET_MODE (op0);
13671 if (op_mode == VOIDmode)
13672 op_mode = GET_MODE (op1);
13680 icode = CODE_FOR_cmpsi;
13685 icode = CODE_FOR_cmpdi;
13690 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13691 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13696 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13697 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13705 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13706 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13712 *prep_seq = get_insns ();
13715 create_fixed_operand (&ops[0], op0);
13716 create_fixed_operand (&ops[1], op1);
13719 if (!maybe_expand_insn (icode, 2, ops))
13724 *gen_seq = get_insns ();
13727 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13728 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13732 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13733 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13735 rtx op0, op1, target;
13736 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13737 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13739 struct expand_operand ops[6];
13742 push_to_sequence (*prep_seq);
13743 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13745 op_mode = GET_MODE (op0);
13746 if (op_mode == VOIDmode)
13747 op_mode = GET_MODE (op1);
13755 icode = CODE_FOR_ccmpsi;
13760 icode = CODE_FOR_ccmpdi;
13765 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13766 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13771 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13772 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13780 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13781 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13787 *prep_seq = get_insns ();
13790 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13791 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13793 if (bit_code != AND)
13795 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13796 GET_MODE (XEXP (prev, 0))),
13797 VOIDmode, XEXP (prev, 0), const0_rtx);
13798 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13801 create_fixed_operand (&ops[0], XEXP (prev, 0));
13802 create_fixed_operand (&ops[1], target);
13803 create_fixed_operand (&ops[2], op0);
13804 create_fixed_operand (&ops[3], op1);
13805 create_fixed_operand (&ops[4], prev);
13806 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13808 push_to_sequence (*gen_seq);
13809 if (!maybe_expand_insn (icode, 6, ops))
13815 *gen_seq = get_insns ();
13818 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13821 #undef TARGET_GEN_CCMP_FIRST
13822 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13824 #undef TARGET_GEN_CCMP_NEXT
13825 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
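/* Together these hooks let the middle end turn a chain of conditions such
   as (a == 0 && b > 42) into one CMP followed by a conditional CCMP and a
   single branch on the final condition flags, instead of two compares and
   two branches.  */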
13827 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13828 instruction fusion of some sort. */
13831 aarch64_macro_fusion_p (void)
13833 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13837 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13838 should be kept together during scheduling. */
13841 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13844 rtx prev_set = single_set (prev);
13845 rtx curr_set = single_set (curr);
13846 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13847 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13849 if (!aarch64_macro_fusion_p ())
13852 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13854 /* We are trying to match:
13855 prev (mov) == (set (reg r0) (const_int imm16))
13856 curr (movk) == (set (zero_extract (reg r0)
13859 (const_int imm16_1)) */
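/* In assembly terms this is a pair like
     mov  x0, #0x1234
     movk x0, #0xbeef, lsl 16
   which many cores can execute as one fused operation.  */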
13861 set_dest = SET_DEST (curr_set);
13863 if (GET_CODE (set_dest) == ZERO_EXTRACT
13864 && CONST_INT_P (SET_SRC (curr_set))
13865 && CONST_INT_P (SET_SRC (prev_set))
13866 && CONST_INT_P (XEXP (set_dest, 2))
13867 && INTVAL (XEXP (set_dest, 2)) == 16
13868 && REG_P (XEXP (set_dest, 0))
13869 && REG_P (SET_DEST (prev_set))
13870 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13876 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13879 /* We're trying to match:
13880 prev (adrp) == (set (reg r1)
13881 (high (symbol_ref ("SYM"))))
13882 curr (add) == (set (reg r0)
13884 (symbol_ref ("SYM"))))
13885 Note that r0 need not necessarily be the same as r1, especially
13886 during pre-regalloc scheduling. */
13888 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13889 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13891 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13892 && REG_P (XEXP (SET_SRC (curr_set), 0))
13893 && REGNO (XEXP (SET_SRC (curr_set), 0))
13894 == REGNO (SET_DEST (prev_set))
13895 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13896 XEXP (SET_SRC (curr_set), 1)))
13901 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13904 /* We're trying to match:
13905 prev (movk) == (set (zero_extract (reg r0)
13908 (const_int imm16_1))
13909 curr (movk) == (set (zero_extract (reg r0)
13912 (const_int imm16_2)) */
13914 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13915 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13916 && REG_P (XEXP (SET_DEST (prev_set), 0))
13917 && REG_P (XEXP (SET_DEST (curr_set), 0))
13918 && REGNO (XEXP (SET_DEST (prev_set), 0))
13919 == REGNO (XEXP (SET_DEST (curr_set), 0))
13920 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13921 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13922 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13923 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13924 && CONST_INT_P (SET_SRC (prev_set))
13925 && CONST_INT_P (SET_SRC (curr_set)))
13929 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13931 /* We're trying to match:
13932 prev (adrp) == (set (reg r0)
13933 (high (symbol_ref ("SYM"))))
13934 curr (ldr) == (set (reg r1)
13935 (mem (lo_sum (reg r0)
13936 (symbol_ref ("SYM")))))
13938 curr (ldr) == (set (reg r1)
13941 (symbol_ref ("SYM")))))) */
13942 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13943 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13945 rtx curr_src = SET_SRC (curr_set);
13947 if (GET_CODE (curr_src) == ZERO_EXTEND)
13948 curr_src = XEXP (curr_src, 0);
13950 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13951 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13952 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13953 == REGNO (SET_DEST (prev_set))
13954 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13955 XEXP (SET_SRC (prev_set), 0)))
13960 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13961 && aarch_crypto_can_dual_issue (prev, curr))
13964 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13965 && any_condjump_p (curr))
13967 enum attr_type prev_type = get_attr_type (prev);
13969 /* FIXME: this misses some instructions that ThunderX considers to be
13970 simple arithmetic; simple shifts are missed here. */
13971 if (prev_type == TYPE_ALUS_SREG
13972 || prev_type == TYPE_ALUS_IMM
13973 || prev_type == TYPE_LOGICS_REG
13974 || prev_type == TYPE_LOGICS_IMM)
13981 /* Return true iff the instruction fusion described by OP is enabled. */
13984 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13986 return (aarch64_tune_params.fusible_ops & op) != 0;
13989 /* If MEM is in the form of [base+offset], extract the two parts
13990 of address and set to BASE and OFFSET, otherwise return false
13991 after clearing BASE and OFFSET. */
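/* For example, (mem (plus (reg x1) (const_int 16))) yields BASE == x1 and
   OFFSET == 16, while a bare (mem (reg x1)) yields OFFSET == const0_rtx.  */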
13994 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13998 gcc_assert (MEM_P (mem));
14000 addr = XEXP (mem, 0);
14005 *offset = const0_rtx;
14009 if (GET_CODE (addr) == PLUS
14010 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14012 *base = XEXP (addr, 0);
14013 *offset = XEXP (addr, 1);
14018 *offset = NULL_RTX;
14023 /* Types for scheduling fusion. */
14024 enum sched_fusion_type
14026 SCHED_FUSION_NONE = 0,
14027 SCHED_FUSION_LD_SIGN_EXTEND,
14028 SCHED_FUSION_LD_ZERO_EXTEND,
14034 /* If INSN is a load or store of address in the form of [base+offset],
14035 extract the two parts and set to BASE and OFFSET. Return scheduling
14036 fusion type this INSN is. */
14038 static enum sched_fusion_type
14039 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14042 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14044 gcc_assert (INSN_P (insn));
14045 x = PATTERN (insn);
14046 if (GET_CODE (x) != SET)
14047 return SCHED_FUSION_NONE;
14050 dest = SET_DEST (x);
14052 machine_mode dest_mode = GET_MODE (dest);
14054 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14055 return SCHED_FUSION_NONE;
14057 if (GET_CODE (src) == SIGN_EXTEND)
14059 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14060 src = XEXP (src, 0);
14061 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14062 return SCHED_FUSION_NONE;
14064 else if (GET_CODE (src) == ZERO_EXTEND)
14066 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14067 src = XEXP (src, 0);
14068 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14069 return SCHED_FUSION_NONE;
14072 if (GET_CODE (src) == MEM && REG_P (dest))
14073 extract_base_offset_in_addr (src, base, offset);
14074 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14076 fusion = SCHED_FUSION_ST;
14077 extract_base_offset_in_addr (dest, base, offset);
14080 return SCHED_FUSION_NONE;
14082 if (*base == NULL_RTX || *offset == NULL_RTX)
14083 fusion = SCHED_FUSION_NONE;
14088 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14090 Currently we only support fusing ldr or str instructions, so FUSION_PRI
14091 and PRI are only calculated for these instructions. For other instructions,
14092 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14093 types of instruction fusion can be added by returning different priorities.
14095 It's important that irrelevant instructions get the largest FUSION_PRI. */
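/* For instance, two loads from [x1, 8] and [x1, 12] get the same FUSION_PRI
   (same fusion type and base register) and PRI values that differ only via
   the offsets, so the scheduler tends to place them next to each other and
   the ldp/stp peepholes can then merge them.  */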
14098 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14099 int *fusion_pri, int *pri)
14103 enum sched_fusion_type fusion;
14105 gcc_assert (INSN_P (insn));
14108 fusion = fusion_load_store (insn, &base, &offset);
14109 if (fusion == SCHED_FUSION_NONE)
14116 /* Set FUSION_PRI according to fusion type and base register. */
14117 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14119 /* Calculate PRI. */
14122 /* INSN with smaller offset goes first. */
14123 off_val = (int)(INTVAL (offset));
14125 tmp -= (off_val & 0xfffff);
14127 tmp += ((- off_val) & 0xfffff);
14133 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14134 Adjust priority of sha1h instructions so they are scheduled before
14135 other SHA1 instructions. */
14138 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14140 rtx x = PATTERN (insn);
14142 if (GET_CODE (x) == SET)
14146 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14147 return priority + 10;
14153 /* Given OPERANDS of consecutive load/store, check if we can merge
14154 them into ldp/stp. LOAD is true if they are load instructions.
14155 MODE is the mode of memory operands. */
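/* For example, the pair "ldr w0, [x2]" / "ldr w1, [x2, 4]" qualifies: same
   base register, consecutive SImode offsets, distinct destination registers
   of the same register class, and neither mem is volatile.  */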
14158 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14159 enum machine_mode mode)
14161 HOST_WIDE_INT offval_1, offval_2, msize;
14162 enum reg_class rclass_1, rclass_2;
14163 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14167 mem_1 = operands[1];
14168 mem_2 = operands[3];
14169 reg_1 = operands[0];
14170 reg_2 = operands[2];
14171 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14172 if (REGNO (reg_1) == REGNO (reg_2))
14177 mem_1 = operands[0];
14178 mem_2 = operands[2];
14179 reg_1 = operands[1];
14180 reg_2 = operands[3];
14183 /* The mems cannot be volatile. */
14184 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14187 /* If we have SImode and slow unaligned ldp,
14188 check the alignment to be at least 8 byte. */
14190 && (aarch64_tune_params.extra_tuning_flags
14191 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14193 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14196 /* Check if the addresses are in the form of [base+offset]. */
14197 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14198 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14200 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14201 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14204 /* Check if the bases are same. */
14205 if (!rtx_equal_p (base_1, base_2))
14208 offval_1 = INTVAL (offset_1);
14209 offval_2 = INTVAL (offset_2);
14210 msize = GET_MODE_SIZE (mode);
14211 /* Check if the offsets are consecutive. */
14212 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14215 /* Check if the addresses are clobbered by load. */
14218 if (reg_mentioned_p (reg_1, mem_1))
14221 /* In increasing order, the last load can clobber the address. */
14222 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14226 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14227 rclass_1 = FP_REGS;
14229 rclass_1 = GENERAL_REGS;
14231 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14232 rclass_2 = FP_REGS;
14234 rclass_2 = GENERAL_REGS;
14236 /* Check if the registers are of same class. */
14237 if (rclass_1 != rclass_2)
14243 /* Given OPERANDS of consecutive load/store, check if we can merge
14244 them into ldp/stp by adjusting the offset. LOAD is true if they
14245 are load instructions. MODE is the mode of memory operands.
14247 Given below consecutive stores:
14249 str w1, [xb, 0x100]
14250 str w1, [xb, 0x104]
14251 str w1, [xb, 0x108]
14252 str w1, [xb, 0x10c]
14254 Though the offsets are out of the range supported by stp, we can
14255 still pair them after adjusting the offset, like:
14257 add scratch, xb, 0x100
14258 stp w1, w1, [scratch]
14259 stp w1, w1, [scratch, 0x8]
14261 The peephole patterns detecting this opportunity should guarantee
14262 the scratch register is available. */
14265 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14266 enum machine_mode mode)
14268 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14269 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14270 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14271 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14275 reg_1 = operands[0];
14276 mem_1 = operands[1];
14277 reg_2 = operands[2];
14278 mem_2 = operands[3];
14279 reg_3 = operands[4];
14280 mem_3 = operands[5];
14281 reg_4 = operands[6];
14282 mem_4 = operands[7];
14283 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14284 && REG_P (reg_3) && REG_P (reg_4));
14285 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14290 mem_1 = operands[0];
14291 reg_1 = operands[1];
14292 mem_2 = operands[2];
14293 reg_2 = operands[3];
14294 mem_3 = operands[4];
14295 reg_3 = operands[5];
14296 mem_4 = operands[6];
14297 reg_4 = operands[7];
14299 /* Skip if memory operand is by itself valid for ldp/stp. */
14300 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14303 /* The mems cannot be volatile. */
14304 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14305 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14308 /* Check if the addresses are in the form of [base+offset]. */
14309 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14310 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14312 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14313 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14315 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14316 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14318 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14319 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14322 /* Check if the bases are same. */
14323 if (!rtx_equal_p (base_1, base_2)
14324 || !rtx_equal_p (base_2, base_3)
14325 || !rtx_equal_p (base_3, base_4))
14328 offval_1 = INTVAL (offset_1);
14329 offval_2 = INTVAL (offset_2);
14330 offval_3 = INTVAL (offset_3);
14331 offval_4 = INTVAL (offset_4);
14332 msize = GET_MODE_SIZE (mode);
14333 /* Check if the offsets are consecutive. */
14334 if ((offval_1 != (offval_2 + msize)
14335 || offval_1 != (offval_3 + msize * 2)
14336 || offval_1 != (offval_4 + msize * 3))
14337 && (offval_4 != (offval_3 + msize)
14338 || offval_4 != (offval_2 + msize * 2)
14339 || offval_4 != (offval_1 + msize * 3)))
14342 /* Check if the addresses are clobbered by load. */
14345 if (reg_mentioned_p (reg_1, mem_1)
14346 || reg_mentioned_p (reg_2, mem_2)
14347 || reg_mentioned_p (reg_3, mem_3))
14350 /* In increasing order, the last load can clobber the address. */
14351 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14355 /* If we have SImode and slow unaligned ldp,
14356 check the alignment to be at least 8 byte. */
14358 && (aarch64_tune_params.extra_tuning_flags
14359 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14361 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14364 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14365 rclass_1 = FP_REGS;
14367 rclass_1 = GENERAL_REGS;
14369 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14370 rclass_2 = FP_REGS;
14372 rclass_2 = GENERAL_REGS;
14374 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14375 rclass_3 = FP_REGS;
14377 rclass_3 = GENERAL_REGS;
14379 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14380 rclass_4 = FP_REGS;
14382 rclass_4 = GENERAL_REGS;
14384 /* Check if the registers are of same class. */
14385 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14391 /* Given OPERANDS of consecutive load/store, this function pairs them
14392 into ldp/stp after adjusting the offset. It depends on the fact
14393 that addresses of load/store instructions are in increasing order.
14394 MODE is the mode of memory operands. CODE is the rtl operator
14395 which should be applied to all memory operands, it's SIGN_EXTEND,
14396 ZERO_EXTEND or UNKNOWN. */
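/* Working through the example in the earlier comment: for SImode msize is 4,
   so stp_off_limit is 4 * 0x40 = 0x100.  An original offset of 0x100 gives
   new_off == 0 and adj_off == 0x100, so the scratch register is set to
   xb + 0x100 and the four stores become two stp instructions at [scratch]
   and [scratch, 8].  */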
14399 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14400 enum machine_mode mode, RTX_CODE code)
14402 rtx base, offset, t1, t2;
14403 rtx mem_1, mem_2, mem_3, mem_4;
14404 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14408 mem_1 = operands[1];
14409 mem_2 = operands[3];
14410 mem_3 = operands[5];
14411 mem_4 = operands[7];
14415 mem_1 = operands[0];
14416 mem_2 = operands[2];
14417 mem_3 = operands[4];
14418 mem_4 = operands[6];
14419 gcc_assert (code == UNKNOWN);
14422 extract_base_offset_in_addr (mem_1, &base, &offset);
14423 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14425 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14426 msize = GET_MODE_SIZE (mode);
14427 stp_off_limit = msize * 0x40;
14428 off_val = INTVAL (offset);
14429 abs_off = (off_val < 0) ? -off_val : off_val;
14430 new_off = abs_off % stp_off_limit;
14431 adj_off = abs_off - new_off;
14433 /* Further adjust to make sure all offsets are OK. */
14434 if ((new_off + msize * 2) >= stp_off_limit)
14436 adj_off += stp_off_limit;
14437 new_off -= stp_off_limit;
14440 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14441 if (adj_off >= 0x1000)
14446 adj_off = -adj_off;
14447 new_off = -new_off;
14450 /* Create new memory references. */
14451 mem_1 = change_address (mem_1, VOIDmode,
14452 plus_constant (DImode, operands[8], new_off));
14454 /* Check if the adjusted address is OK for ldp/stp. */
14455 if (!aarch64_mem_pair_operand (mem_1, mode))
14458 msize = GET_MODE_SIZE (mode);
14459 mem_2 = change_address (mem_2, VOIDmode,
14460 plus_constant (DImode,
14463 mem_3 = change_address (mem_3, VOIDmode,
14464 plus_constant (DImode,
14466 new_off + msize * 2));
14467 mem_4 = change_address (mem_4, VOIDmode,
14468 plus_constant (DImode,
14470 new_off + msize * 3));
14472 if (code == ZERO_EXTEND)
14474 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14475 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14476 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14477 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14479 else if (code == SIGN_EXTEND)
14481 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14482 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14483 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14484 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
[1] = mem_1
;
14490 operands
[3] = mem_2
;
14491 operands
[5] = mem_3
;
14492 operands
[7] = mem_4
;
14496 operands
[0] = mem_1
;
14497 operands
[2] = mem_2
;
14498 operands
[4] = mem_3
;
14499 operands
[6] = mem_4
;
14502 /* Emit adjusting instruction. */
14503 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
14504 /* Emit ldp/stp instructions. */
14505 t1
= gen_rtx_SET (operands
[0], operands
[1]);
14506 t2
= gen_rtx_SET (operands
[2], operands
[3]);
14507 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
14508 t1
= gen_rtx_SET (operands
[4], operands
[5]);
14509 t2
= gen_rtx_SET (operands
[6], operands
[7]);
14510 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
14514 /* Return 1 if pseudo register should be created and used to hold
14515 GOT address for PIC code. */
14518 aarch64_use_pseudo_pic_reg (void)
14520 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14523 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14526 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14528 switch (XINT (x, 1))
14530 case UNSPEC_GOTSMALLPIC:
14531 case UNSPEC_GOTSMALLPIC28K:
14532 case UNSPEC_GOTTINYPIC:
14538 return default_unspec_may_trap_p (x, flags);
14542 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14543 return the log2 of that value. Otherwise return -1. */
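/* For example, 8.0 yields 3, while 0.5 (not an integer), negative values,
   NaNs and infinities all yield -1.  */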
14546 aarch64_fpconst_pow_of_2 (rtx x)
14548 const REAL_VALUE_TYPE *r;
14550 if (!CONST_DOUBLE_P (x))
14553 r = CONST_DOUBLE_REAL_VALUE (x);
14555 if (REAL_VALUE_NEGATIVE (*r)
14556 || REAL_VALUE_ISNAN (*r)
14557 || REAL_VALUE_ISINF (*r)
14558 || !real_isinteger (r, DFmode))
14561 return exact_log2 (real_to_integer (r));
14564 /* If X is a vector of equal CONST_DOUBLE values and that value is
14565 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14568 aarch64_vec_fpconst_pow_of_2 (rtx x)
14570 if (GET_CODE (x) != CONST_VECTOR)
14573 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14576 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14580 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14581 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14587 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14590 __fp16 always promotes through this hook.
14591 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14592 through the generic excess precision logic rather than here. */
14595 aarch64_promoted_type (const_tree t)
14597 if (SCALAR_FLOAT_TYPE_P (t)
14598 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14599 return float_type_node;
14604 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14607 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14608 optimization_type opt_type)
14613 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14620 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14621 if MODE is HFmode, and punt to the generic implementation otherwise. */
14624 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14626 return (mode == HFmode
14628 : default_libgcc_floating_mode_supported_p (mode));
14631 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14632 if MODE is HFmode, and punt to the generic implementation otherwise. */
14635 aarch64_scalar_mode_supported_p (machine_mode mode)
14637 return (mode == HFmode
14639 : default_scalar_mode_supported_p (mode));
14642 /* Set the value of FLT_EVAL_METHOD.
14643 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14645 0: evaluate all operations and constants, whose semantic type has at
14646 most the range and precision of type float, to the range and
14647 precision of float; evaluate all other operations and constants to
14648 the range and precision of the semantic type;
14650 N, where _FloatN is a supported interchange floating type
14651 evaluate all operations and constants, whose semantic type has at
14652 most the range and precision of _FloatN type, to the range and
14653 precision of the _FloatN type; evaluate all other operations and
14654 constants to the range and precision of the semantic type;
14656 If we have the ARMv8.2-A extensions then we support _Float16 in native
14657 precision, so we should set this to 16. Otherwise, we support the type,
14658 but want to evaluate expressions in float precision, so set this to
14661 static enum flt_eval_method
14662 aarch64_excess_precision (enum excess_precision_type type)
14666 case EXCESS_PRECISION_TYPE_FAST:
14667 case EXCESS_PRECISION_TYPE_STANDARD:
14668 /* We can calculate either in 16-bit range and precision or
14669 32-bit range and precision. Make that decision based on whether
14670 we have native support for the ARMv8.2-A 16-bit floating-point
14671 instructions or not. */
14672 return (TARGET_FP_F16INST
14673 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14674 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14675 case EXCESS_PRECISION_TYPE_IMPLICIT:
14676 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14678 gcc_unreachable ();
14680 return FLT_EVAL_METHOD_UNPREDICTABLE;
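/* In practice: with +fp16 (TARGET_FP_F16INST) an expression such as "a + b"
   on two _Float16 values is evaluated directly in 16-bit precision; without
   it, both operands are promoted to float for the arithmetic and the result
   is converted back.  */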
14683 /* Target-specific selftests. */
14687 namespace selftest {
14689 /* Selftest for the RTL loader.
14690 Verify that the RTL loader copes with a dump from
14691 print_rtx_function. This is essentially just a test that class
14692 function_reader can handle a real dump, but it also verifies
14693 that lookup_reg_by_dump_name correctly handles hard regs.
14694 The presence of hard reg names in the dump means that the test is
14695 target-specific, hence it is in this file. */
14698 aarch64_test_loading_full_dump ()
14700 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14702 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14704 rtx_insn *insn_1 = get_insn_by_uid (1);
14705 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14707 rtx_insn *insn_15 = get_insn_by_uid (15);
14708 ASSERT_EQ (INSN, GET_CODE (insn_15));
14709 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14711 /* Verify crtl->return_rtx. */
14712 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14713 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14714 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14717 /* Run all target-specific selftests. */
14720 aarch64_run_selftests (void)
14722 aarch64_test_loading_full_dump ();
14725 } // namespace selftest
14727 #endif /* #if CHECKING_P */
14729 #undef TARGET_ADDRESS_COST
14730 #define TARGET_ADDRESS_COST aarch64_address_cost
14732 /* This hook determines whether unnamed bitfields affect the alignment
14733 of the containing structure. The hook returns true if the structure
14734 should inherit the alignment requirements of an unnamed bitfield's
14736 #undef TARGET_ALIGN_ANON_BITFIELD
14737 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14739 #undef TARGET_ASM_ALIGNED_DI_OP
14740 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14742 #undef TARGET_ASM_ALIGNED_HI_OP
14743 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14745 #undef TARGET_ASM_ALIGNED_SI_OP
14746 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14748 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14749 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14750 hook_bool_const_tree_hwi_hwi_const_tree_true
14752 #undef TARGET_ASM_FILE_START
14753 #define TARGET_ASM_FILE_START aarch64_start_file
14755 #undef TARGET_ASM_OUTPUT_MI_THUNK
14756 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14758 #undef TARGET_ASM_SELECT_RTX_SECTION
14759 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14761 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14762 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14764 #undef TARGET_BUILD_BUILTIN_VA_LIST
14765 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14767 #undef TARGET_CALLEE_COPIES
14768 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14770 #undef TARGET_CAN_ELIMINATE
14771 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14773 #undef TARGET_CAN_INLINE_P
14774 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14776 #undef TARGET_CANNOT_FORCE_CONST_MEM
14777 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14779 #undef TARGET_CASE_VALUES_THRESHOLD
14780 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14782 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14783 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14785 /* Only the least significant bit is used for initialization guard
14787 #undef TARGET_CXX_GUARD_MASK_BIT
14788 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14790 #undef TARGET_C_MODE_FOR_SUFFIX
14791 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14793 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14794 #undef TARGET_DEFAULT_TARGET_FLAGS
14795 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14798 #undef TARGET_CLASS_MAX_NREGS
14799 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14801 #undef TARGET_BUILTIN_DECL
14802 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14804 #undef TARGET_BUILTIN_RECIPROCAL
14805 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14807 #undef TARGET_C_EXCESS_PRECISION
14808 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14810 #undef TARGET_EXPAND_BUILTIN
14811 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14813 #undef TARGET_EXPAND_BUILTIN_VA_START
14814 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14816 #undef TARGET_FOLD_BUILTIN
14817 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14819 #undef TARGET_FUNCTION_ARG
14820 #define TARGET_FUNCTION_ARG aarch64_function_arg
14822 #undef TARGET_FUNCTION_ARG_ADVANCE
14823 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14825 #undef TARGET_FUNCTION_ARG_BOUNDARY
14826 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14828 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14829 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14831 #undef TARGET_FUNCTION_VALUE
14832 #define TARGET_FUNCTION_VALUE aarch64_function_value
14834 #undef TARGET_FUNCTION_VALUE_REGNO_P
14835 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14837 #undef TARGET_FRAME_POINTER_REQUIRED
14838 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14840 #undef TARGET_GIMPLE_FOLD_BUILTIN
14841 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14843 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14844 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14846 #undef TARGET_INIT_BUILTINS
14847 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14849 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14850 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14851 aarch64_ira_change_pseudo_allocno_class
14853 #undef TARGET_LEGITIMATE_ADDRESS_P
14854 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14856 #undef TARGET_LEGITIMATE_CONSTANT_P
14857 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14859 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14860 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14861 aarch64_legitimize_address_displacement
14863 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14864 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14866 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14867 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14868 aarch64_libgcc_floating_mode_supported_p
14870 #undef TARGET_MANGLE_TYPE
14871 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14873 #undef TARGET_MEMORY_MOVE_COST
14874 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14876 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14877 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14879 #undef TARGET_MUST_PASS_IN_STACK
14880 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14882 /* This target hook should return true if accesses to volatile bitfields
14883 should use the narrowest mode possible. It should return false if these
14884 accesses should use the bitfield container type. */
14885 #undef TARGET_NARROW_VOLATILE_BITFIELD
14886 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14888 #undef TARGET_OPTION_OVERRIDE
14889 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14891 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14892 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14893 aarch64_override_options_after_change
14895 #undef TARGET_OPTION_SAVE
14896 #define TARGET_OPTION_SAVE aarch64_option_save
14898 #undef TARGET_OPTION_RESTORE
14899 #define TARGET_OPTION_RESTORE aarch64_option_restore
14901 #undef TARGET_OPTION_PRINT
14902 #define TARGET_OPTION_PRINT aarch64_option_print
14904 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14905 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14907 #undef TARGET_SET_CURRENT_FUNCTION
14908 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14910 #undef TARGET_PASS_BY_REFERENCE
14911 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14913 #undef TARGET_PREFERRED_RELOAD_CLASS
14914 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14916 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14917 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14919 #undef TARGET_PROMOTED_TYPE
14920 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14922 #undef TARGET_SECONDARY_RELOAD
14923 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14925 #undef TARGET_SHIFT_TRUNCATION_MASK
14926 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14928 #undef TARGET_SETUP_INCOMING_VARARGS
14929 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14931 #undef TARGET_STRUCT_VALUE_RTX
14932 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14934 #undef TARGET_REGISTER_MOVE_COST
14935 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14937 #undef TARGET_RETURN_IN_MEMORY
14938 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14940 #undef TARGET_RETURN_IN_MSB
14941 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14943 #undef TARGET_RTX_COSTS
14944 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14946 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14947 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14949 #undef TARGET_SCHED_ISSUE_RATE
14950 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14952 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14953 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14954 aarch64_sched_first_cycle_multipass_dfa_lookahead
14956 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14957 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14958 aarch64_first_cycle_multipass_dfa_lookahead_guard
14960 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14961 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14962 aarch64_get_separate_components
14964 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14965 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14966 aarch64_components_for_bb
14968 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14969 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14970 aarch64_disqualify_components
14972 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14973 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14974 aarch64_emit_prologue_components
14976 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14977 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14978 aarch64_emit_epilogue_components
14980 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14981 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14982 aarch64_set_handled_components
14984 #undef TARGET_TRAMPOLINE_INIT
14985 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14987 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14988 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14990 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14991 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14993 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
14994 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
14995 aarch64_builtin_support_vector_misalignment
14997 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14998 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15000 #undef TARGET_VECTORIZE_ADD_STMT_COST
15001 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15003 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15004 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15005 aarch64_builtin_vectorization_cost
15007 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15008 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15010 #undef TARGET_VECTORIZE_BUILTINS
15011 #define TARGET_VECTORIZE_BUILTINS
15013 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15014 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15015 aarch64_builtin_vectorized_function
15017 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15018 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15019 aarch64_autovectorize_vector_sizes
15021 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15022 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15023 aarch64_atomic_assign_expand_fenv
15025 /* Section anchor support. */
15027 #undef TARGET_MIN_ANCHOR_OFFSET
15028 #define TARGET_MIN_ANCHOR_OFFSET -256
15030 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15031 byte offset; we can do much more for larger data types, but have no way
15032 to determine the size of the access. We assume accesses are aligned. */
15033 #undef TARGET_MAX_ANCHOR_OFFSET
15034 #define TARGET_MAX_ANCHOR_OFFSET 4095
15036 #undef TARGET_VECTOR_ALIGNMENT
15037 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15039 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15040 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15041 aarch64_simd_vector_alignment_reachable
15043 /* vec_perm support. */
15045 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15046 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15047 aarch64_vectorize_vec_perm_const_ok
15049 #undef TARGET_INIT_LIBFUNCS
15050 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15052 #undef TARGET_FIXED_CONDITION_CODE_REGS
15053 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15055 #undef TARGET_FLAGS_REGNUM
15056 #define TARGET_FLAGS_REGNUM CC_REGNUM
15058 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15059 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15061 #undef TARGET_ASAN_SHADOW_OFFSET
15062 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15064 #undef TARGET_LEGITIMIZE_ADDRESS
15065 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15067 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15068 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15069 aarch64_use_by_pieces_infrastructure_p
15071 #undef TARGET_CAN_USE_DOLOOP_P
15072 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15074 #undef TARGET_SCHED_ADJUST_PRIORITY
15075 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15077 #undef TARGET_SCHED_MACRO_FUSION_P
15078 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15080 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15081 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15083 #undef TARGET_SCHED_FUSION_PRIORITY
15084 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15086 #undef TARGET_UNSPEC_MAY_TRAP_P
15087 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15089 #undef TARGET_USE_PSEUDO_PIC_REG
15090 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15092 #undef TARGET_PRINT_OPERAND
15093 #define TARGET_PRINT_OPERAND aarch64_print_operand
15095 #undef TARGET_PRINT_OPERAND_ADDRESS
15096 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15098 #undef TARGET_OPTAB_SUPPORTED_P
15099 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15101 #undef TARGET_OMIT_STRUCT_RETURN_REG
15102 #define TARGET_OMIT_STRUCT_RETURN_REG true
15104 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15105 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15106 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15109 #undef TARGET_RUN_TARGET_SELFTESTS
15110 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15111 #endif /* #if CHECKING_P */
15113 struct gcc_target targetm = TARGET_INITIALIZER;
15115 #include "gt-aarch64.h"