[official-gcc.git] / gcc / config / aarch64 / aarch64.c  (as of 2017-08-01, Tamar Christina <tamar.christina@arm.com>)
/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2017 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#include "config.h"
#define INCLUDE_STRING
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "params.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest.h"
#include "selftest-rtl.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC:
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type {
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};

struct aarch64_address_info {
  enum aarch64_address_type type;
  rtx base;
  rtx offset;
  int shift;
  enum aarch64_symbol_type symbol_type;
};
struct simd_immediate_info
{
  rtx value;
  int shift;
  int element_width;
  bool mvn;
  bool msl;
};
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
                                                     const_tree,
                                                     machine_mode *, int *,
                                                     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
                                                 const unsigned char *sel);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
                                                         const_tree type,
                                                         int misalignment,
                                                         bool is_packed);
static machine_mode
aarch64_simd_container_mode (machine_mode mode, unsigned width);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
    {
      1, /* hi */
      0, /* si */
      0, /* di */
      1, /* ti */
    },
  0, /* pre_modify */
  0, /* post_modify */
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */
  0 /* imm_offset */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
    {
      0, /* hi */
      0, /* si */
      0, /* di */
      2, /* ti */
    },
  0, /* pre_modify */
  0, /* post_modify */
  1, /* register_offset */
  1, /* register_sextend */
  2, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
    {
      1, /* hi */
      0, /* si */
      0, /* di */
      1, /* ti */
    },
  1, /* pre_modify */
  0, /* post_modify */
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
  0, /* imm_offset */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
    {
      1, /* hi */
      1, /* si */
      1, /* di */
      2, /* ti */
    },
  0, /* pre_modify */
  0, /* post_modify */
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
  0, /* imm_offset */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP */
  5, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP */
  5, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP */
  5, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
  9, /* GP2FP */
  9, /* FP2GP */
  1 /* FP2FP */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
  2, /* GP2GP */
  2, /* GP2FP */
  6, /* FP2GP */
  4 /* FP2FP */
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  8, /* GP2FP */
  8, /* FP2GP */
  2 /* FP2FP */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  2, /* GP2GP */
  /* Avoid the use of int<->fp moves for spilling.  */
  6, /* GP2FP */
  6, /* FP2GP */
  4 /* FP2FP */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  1, /* GP2GP */
  /* Avoid the use of int<->fp moves for spilling.  */
  8, /* GP2FP */
  8, /* FP2GP */
  4 /* FP2FP */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  4, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  4, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  5, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  3 /* cond_not_taken_branch_cost */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  4, /* vec_align_load_cost */
  4, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* vec_int_stmt_cost */
  3, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* vec_align_load_cost */
  10, /* vec_unalign_load_cost */
  2, /* vec_unalign_store_cost */
  2, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  6, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  5, /* vec_int_stmt_cost */
  6, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  8, /* vec_align_load_cost */
  8, /* vec_unalign_load_cost */
  4, /* vec_unalign_store_cost */
  4, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1 /* cond_not_taken_branch_cost */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_NONE  /* recip_sqrt */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_ALL,  /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division */
  AARCH64_APPROX_NONE, /* sqrt */
  AARCH64_APPROX_ALL   /* recip_sqrt */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  0,       /* num_slots */
  -1,      /* l1_cache_size */
  -1,      /* l1_cache_line_size */
  -1,      /* l2_cache_size */
  -1       /* default_opt_level */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  0,       /* num_slots */
  -1,      /* l1_cache_size */
  64,      /* l1_cache_line_size */
  -1,      /* l2_cache_size */
  -1       /* default_opt_level */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  4,       /* num_slots */
  32,      /* l1_cache_size */
  64,      /* l1_cache_line_size */
  1024,    /* l2_cache_size */
  3        /* default_opt_level */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  8,       /* num_slots */
  32,      /* l1_cache_size */
  128,     /* l1_cache_line_size */
  16*1024, /* l2_cache_size */
  3        /* default_opt_level */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  8,       /* num_slots */
  32,      /* l1_cache_size */
  128,     /* l1_cache_line_size */
  -1,      /* l2_cache_size */
  -1       /* default_opt_level */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  8,       /* num_slots */
  32,      /* l1_cache_size */
  64,      /* l1_cache_line_size */
  256,     /* l2_cache_size */
  -1       /* default_opt_level */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost */
  2, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  8,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost */
  1, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost */
  2, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost */
  3, /* issue_rate */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  2, /* issue_rate.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  4,	/* memmov_cost */
  3,	/* issue_rate */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  4,	/* function_align.  */
  4,	/* jump_align.  */
  4,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  6, /* memmov_cost */
  2, /* issue_rate */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
  8,	/* function_align.  */
  8,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  6, /* memmov_cost */
  2, /* issue_rate */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
  8,	/* function_align.  */
  8,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  6, /* memmov_cost */
  4, /* issue_rate */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &generic_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost */
  4, /* issue_rate */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
  &qdf24xx_prefetch_tune
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
{
  const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
  if (TARGET_GENERAL_REGS_ONLY)
    error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
  else
    error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
   the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
   used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
   cost (in this case the best class is the lowest cost one).  Using ALL_REGS
   irrespectively of its cost results in bad allocations with many redundant
   int<->FP moves which are expensive on various cores.
   To avoid this we don't allow ALL_REGS as the allocno class, but force a
   decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
   isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
   Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (allocno_class != ALL_REGS)
    return allocno_class;

  if (best_class != ALL_REGS)
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

static int
aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
			     machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  if (FLOAT_MODE_P (mode))
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
/* Return TRUE if MODE is any of the large INT modes.  */
static bool
aarch64_vect_struct_mode_p (machine_mode mode)
{
  return mode == OImode || mode == CImode || mode == XImode;
}

/* Return TRUE if MODE is any of the vector modes.  */
static bool
aarch64_vector_mode_p (machine_mode mode)
{
  return aarch64_vector_mode_supported_p (mode)
	 || aarch64_vect_struct_mode_p (mode);
}

/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* Implement HARD_REGNO_NREGS.  */

int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
    default:
      return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
    }
  gcc_unreachable ();
}
/* Implement HARD_REGNO_MODE_OK.  */

int
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
    return 1;

  if (FP_REGNUM_P (regno))
    {
      if (aarch64_vect_struct_mode_p (mode))
	return
	  (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
      else
	return 1;
    }

  return 0;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
				     machine_mode mode)
{
  /* Handle modes that fit within single registers.  */
  if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
    {
      if (GET_MODE_SIZE (mode) >= 4)
	return mode;
      else
	return SImode;
    }
  /* Fall back to generic for multi-reg and very large modes.  */
  else
    return choose_hard_reg_mode (regno, nregs, false);
}
/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  return cc_reg;
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  rtx sym, addend;

  if (GET_CODE (addr) == CONST)
    {
      split_const (addr, &sym, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add   dest, x1, :lo_12:foo

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp
   mrs  tp, tpidr_el0
   add  dest, dest, tp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm  */

static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but we are
	       using the page base as GOT base, the first page may be wasted;
	       in the worst scenario, there is only 28K space for the GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate an initialize insn
	       for every global access, and allow CSE to remove all redundant
	       ones.

	       The final instruction sequence will look like the following
	       for multiple global variable accesses:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern changed, above code which calculate mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	rtx insn;
	rtx mem;
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    else
	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_TLSGD:
      {
	rtx_insn *insns;
	machine_mode mode = GET_MODE (dest);
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	start_sequence ();
	if (TARGET_ILP32)
	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	else
	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();
	end_sequence ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
	return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
      return;

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
	    }
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
	    }
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}
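
/* Return true if a 128-bit move from SRC to DST needs to be split into
   two word-sized moves; only a move between two FP/SIMD registers can be
   left as a single 128-bit move.  */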
bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  rtx (*gen) (rtx, rtx, rtx);

  switch (src_mode)
    {
    case V8QImode:
      gen = gen_aarch64_simd_combinev8qi;
      break;
    case V4HImode:
      gen = gen_aarch64_simd_combinev4hi;
      break;
    case V2SImode:
      gen = gen_aarch64_simd_combinev2si;
      break;
    case V4HFmode:
      gen = gen_aarch64_simd_combinev4hf;
      break;
    case V2SFmode:
      gen = gen_aarch64_simd_combinev2sf;
      break;
    case DImode:
      gen = gen_aarch64_simd_combinedi;
      break;
    case DFmode:
      gen = gen_aarch64_simd_combinedf;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, src1, src2));
  return;
}
/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      rtx (*gen) (rtx, rtx);

      gcc_assert (VECTOR_MODE_P (src_mode));

      switch (src_mode)
	{
	case V16QImode:
	  gen = gen_aarch64_split_simd_movv16qi;
	  break;
	case V8HImode:
	  gen = gen_aarch64_split_simd_movv8hi;
	  break;
	case V4SImode:
	  gen = gen_aarch64_split_simd_movv4si;
	  break;
	case V2DImode:
	  gen = gen_aarch64_split_simd_movv2di;
	  break;
	case V8HFmode:
	  gen = gen_aarch64_split_simd_movv8hf;
	  break;
	case V4SFmode:
	  gen = gen_aarch64_split_simd_movv4sf;
	  break;
	case V2DFmode:
	  gen = gen_aarch64_split_simd_movv2df;
	  break;
	default:
	  gcc_unreachable ();
	}

      emit_insn (gen (dst, src));
      return;
    }
}
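
/* Return true if the constant X (in mode XMODE) is equal to the constant Y
   (in mode YMODE) zero-extended to XMODE.  */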
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
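
/* Return a register holding VALUE in MODE.  A fresh pseudo is used where
   possible; otherwise VALUE is moved into the existing register X.  */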
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      x = aarch64_emit_move (x, value);
      return x;
    }
}
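
/* Return an rtx equivalent to REG + OFFSET in MODE.  If OFFSET is not a
   valid "add" immediate it is first loaded into a register, using TEMP as
   a temporary where necessary.  */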
static rtx
aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
{
  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
    {
      rtx high;
      /* Load the full offset into a register.  This
	 might be improvable in the future.  */
      high = GEN_INT (offset);
      offset = 0;
      high = aarch64_force_temporary (mode, temp, high);
      reg = aarch64_force_temporary (mode, temp,
				     gen_rtx_PLUS (mode, high, reg));
    }
  return plus_constant (mode, reg, offset);
}
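
/* Return the number of instructions needed to move immediate IMM of mode
   MODE into DEST, emitting the actual instructions when GENERATE is true.  */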
static int
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
				machine_mode mode)
{
  int i;
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;
  int num_insns;

  val = INTVAL (imm);

  if (aarch64_move_imm (val, mode))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, imm));
      return 1;
    }

  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero).  In that case check to see if the move can be done
     in a smaller mode.  */
  val2 = val & 0xffffffff;
  if (mode == DImode
      && aarch64_move_imm (val2, SImode)
      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));

      /* Check if we have to emit a second instruction by checking to see
	 if any of the upper 32 bits of the original DI mode value is set.  */
      if (val == val2)
	return 1;

      i = (val >> 48) ? 48 : 32;

      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));

      return 2;
    }

  if ((val >> 32) == 0 || mode == SImode)
    {
      if (generate)
	{
	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
	  if (mode == SImode)
	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	  else
	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	}
      return 2;
    }

  /* Remaining cases are all for DImode.  */

  mask = 0xffff;
  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
    {
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
	 For a 64-bit bitmask try whether changing 16 bits to all ones or
	 zeroes creates a valid bitmask.  To check any repeated bitmask,
	 try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
	{
	  val2 = val & ~mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val | mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val2 & ~mask;
	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	}
      if (i != 64)
	{
	  if (generate)
	    {
	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
					 GEN_INT ((val >> i) & 0xffff)));
	    }
	  return 2;
	}
    }

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
     otherwise skip zero bits.  */

  num_insns = 1;
  mask = 0xffff;
  val2 = one_match > zero_match ? ~val : val;
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

  if (generate)
    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
					   ? (val | ~(mask << i))
					   : (val & (mask << i)))));
  for (i += 16; i < 64; i += 16)
    {
      if ((val2 & (mask << i)) == 0)
	continue;
      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));
      num_insns ++;
    }

  return num_insns;
}
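
/* Expand a move of general operand IMM (an immediate or a symbolic
   constant) into integer register DEST of mode SImode or DImode.  */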
void
aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
  machine_mode mode = GET_MODE (dest);

  gcc_assert (mode == SImode || mode == DImode);

  /* Check on what type of symbol it is.  */
  if (GET_CODE (imm) == SYMBOL_REF
      || GET_CODE (imm) == LABEL_REF
      || GET_CODE (imm) == CONST)
    {
      rtx mem, base, offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
	 before we start classifying the symbol.  */
      split_const (imm, &base, &offset);

      sty = aarch64_classify_symbol (base, offset);
      switch (sty)
	{
	case SYMBOL_FORCE_TO_MEM:
	  if (offset != const0_rtx
	      && targetm.cannot_force_const_mem (mode, imm))
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (mode, dest, base);
	      base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
	      aarch64_emit_move (dest, base);
	      return;
	    }

	  mem = force_const_mem (ptr_mode, imm);
	  gcc_assert (mem);

	  /* If we aren't generating PC relative literals, then
	     we need to expand the literal pool access carefully.
	     This is something that needs to be done in a number
	     of places, so could well live as a separate function.  */
	  if (!aarch64_pcrelative_literal_loads)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = gen_reg_rtx (ptr_mode);
	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
	      if (ptr_mode != Pmode)
		base = convert_memory_address (Pmode, base);
	      mem = gen_rtx_MEM (ptr_mode, base);
	    }

	  if (mode != ptr_mode)
	    mem = gen_rtx_ZERO_EXTEND (mode, mem);

	  emit_insn (gen_rtx_SET (dest, mem));

	  return;

	case SYMBOL_SMALL_TLSGD:
	case SYMBOL_SMALL_TLSDESC:
	case SYMBOL_SMALL_TLSIE:
	case SYMBOL_SMALL_GOT_28K:
	case SYMBOL_SMALL_GOT_4G:
	case SYMBOL_TINY_GOT:
	case SYMBOL_TINY_TLSIE:
	  if (offset != const0_rtx)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (mode, dest, base);
	      base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
	      aarch64_emit_move (dest, base);
	      return;
	    }
	  /* FALLTHRU */

	case SYMBOL_SMALL_ABSOLUTE:
	case SYMBOL_TINY_ABSOLUTE:
	case SYMBOL_TLSLE12:
	case SYMBOL_TLSLE24:
	case SYMBOL_TLSLE32:
	case SYMBOL_TLSLE48:
	  aarch64_load_symref_appropriately (dest, imm, sty);
	  return;

	default:
	  gcc_unreachable ();
	}
    }

  if (!CONST_INT_P (imm))
    {
      if (GET_CODE (imm) == HIGH)
	emit_insn (gen_rtx_SET (dest, imm));
      else
	{
	  rtx mem = force_const_mem (mode, imm);
	  gcc_assert (mem);
	  emit_insn (gen_rtx_SET (dest, mem));
	}

      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
}
2047 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2048 temporary value if necessary. FRAME_RELATED_P should be true if
2049 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2050 to the generated instructions. If SCRATCHREG is known to hold
2051 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2052 immediate again.
2054 Since this function may be used to adjust the stack pointer, we must
2055 ensure that it cannot cause transient stack deallocation (for example
2056 by first incrementing SP and then decrementing when adjusting by a
2057 large immediate). */
2059 static void
2060 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2061 HOST_WIDE_INT delta, bool frame_related_p,
2062 bool emit_move_imm)
2064 HOST_WIDE_INT mdelta = abs_hwi (delta);
2065 rtx this_rtx = gen_rtx_REG (mode, regnum);
2066 rtx_insn *insn;
2068 if (!mdelta)
2069 return;
2071 /* Single instruction adjustment. */
2072 if (aarch64_uimm12_shift (mdelta))
2074 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2075 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2076 return;
2079 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2080 Only do this if mdelta cannot be created with a single move immediate;
2081 when it can, adjusting via a move and an add/sub is better. */
2082 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2084 HOST_WIDE_INT low_off = mdelta & 0xfff;
2086 low_off = delta < 0 ? -low_off : low_off;
2087 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2088 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2089 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2090 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2091 return;
2094 /* Emit a move immediate if required and an addition/subtraction. */
2095 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2096 if (emit_move_imm)
2097 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2098 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2099 : gen_add2_insn (this_rtx, scratch_rtx));
2100 if (frame_related_p)
2102 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2103 rtx adj = plus_constant (mode, this_rtx, delta);
2104 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
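/* Hypothetical standalone sketch (not GCC code) of the strategy selection
   above for an adjustment DELTA: one ADD/SUB when |delta| is a 12-bit
   immediate, optionally shifted left by 12; two ADD/SUBs when |delta| fits
   in 24 bits and is not better done as a single move immediate; otherwise
   a move immediate into the scratch register followed by one ADD/SUB.  The
   move-immediate test is simplified here to a plain 16-bit MOVZ check,
   whereas aarch64_move_imm also accepts MOVN and bitmask immediates.  */

enum example_add_strategy
{
  EXAMPLE_ONE_ADD,
  EXAMPLE_TWO_ADDS,
  EXAMPLE_MOV_THEN_ADD
};

enum example_add_strategy
example_pick_add_strategy (long long delta)
{
  unsigned long long mdelta
    = delta < 0 ? -(unsigned long long) delta : (unsigned long long) delta;

  /* ADD/SUB immediate: 12 bits, optionally shifted left by 12.  */
  if (mdelta < 0x1000 || (mdelta <= 0xfff000 && (mdelta & 0xfff) == 0))
    return EXAMPLE_ONE_ADD;

  int single_movz_p = (mdelta >> 16) == 0;
  if (mdelta < 0x1000000 && !single_movz_p)
    return EXAMPLE_TWO_ADDS;

  return EXAMPLE_MOV_THEN_ADD;
}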
2108 static inline void
2109 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2110 HOST_WIDE_INT delta)
2112 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2115 static inline void
2116 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2118 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2119 true, emit_move_imm);
2122 static inline void
2123 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2125 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2126 frame_related_p, true);
2129 static bool
2130 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2131 tree exp ATTRIBUTE_UNUSED)
2133 /* Currently, always true. */
2134 return true;
2137 /* Implement TARGET_PASS_BY_REFERENCE. */
2139 static bool
2140 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2141 machine_mode mode,
2142 const_tree type,
2143 bool named ATTRIBUTE_UNUSED)
2145 HOST_WIDE_INT size;
2146 machine_mode dummymode;
2147 int nregs;
2149 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2150 size = (mode == BLKmode && type)
2151 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2153 /* Aggregates are passed by reference based on their size. */
2154 if (type && AGGREGATE_TYPE_P (type))
2156 size = int_size_in_bytes (type);
2160 /* Variable-sized arguments are always passed by reference. */
2160 if (size < 0)
2161 return true;
2163 /* Can this be a candidate to be passed in fp/simd register(s)? */
2164 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165 &dummymode, &nregs,
2166 NULL))
2167 return false;
2169 /* Arguments which are variable sized or larger than 2 registers are
2170 passed by reference unless they are a homogeneous floating-point
2171 aggregate. */
2172 return size > 2 * UNITS_PER_WORD;
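/* A simplified standalone illustration (not GCC code) of the rule above:
   an argument is passed by reference when its size is variable or unknown,
   or when it needs more than two 8-byte registers, unless it can live in
   SIMD/FP registers (a short vector, HFA or HVA).  */

int
example_pass_by_reference_p (long long size_in_bytes, int vfp_candidate_p)
{
  if (size_in_bytes < 0)        /* Variable-sized argument.  */
    return 1;
  if (vfp_candidate_p)          /* Handled by the V registers.  */
    return 0;
  return size_in_bytes > 2 * 8; /* Larger than two X registers.  */
}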
2175 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2176 static bool
2177 aarch64_return_in_msb (const_tree valtype)
2179 machine_mode dummy_mode;
2180 int dummy_int;
2182 /* Never happens in little-endian mode. */
2183 if (!BYTES_BIG_ENDIAN)
2184 return false;
2186 /* Only composite types no larger than 16 bytes can potentially
2187 be returned in registers. */
2188 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2189 || int_size_in_bytes (valtype) <= 0
2190 || int_size_in_bytes (valtype) > 16)
2191 return false;
2193 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2194 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2195 is always passed/returned in the least significant bits of fp/simd
2196 register(s). */
2197 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2198 &dummy_mode, &dummy_int, NULL))
2199 return false;
2201 return true;
2204 /* Implement TARGET_FUNCTION_VALUE.
2205 Define how to find the value returned by a function. */
2207 static rtx
2208 aarch64_function_value (const_tree type, const_tree func,
2209 bool outgoing ATTRIBUTE_UNUSED)
2211 machine_mode mode;
2212 int unsignedp;
2213 int count;
2214 machine_mode ag_mode;
2216 mode = TYPE_MODE (type);
2217 if (INTEGRAL_TYPE_P (type))
2218 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2220 if (aarch64_return_in_msb (type))
2222 HOST_WIDE_INT size = int_size_in_bytes (type);
2224 if (size % UNITS_PER_WORD != 0)
2226 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2227 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2231 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2232 &ag_mode, &count, NULL))
2234 if (!aarch64_composite_type_p (type, mode))
2236 gcc_assert (count == 1 && mode == ag_mode);
2237 return gen_rtx_REG (mode, V0_REGNUM);
2239 else
2241 int i;
2242 rtx par;
2244 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2245 for (i = 0; i < count; i++)
2247 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2248 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2249 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2250 XVECEXP (par, 0, i) = tmp;
2252 return par;
2255 else
2256 return gen_rtx_REG (mode, R0_REGNUM);
2259 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2260 Return true if REGNO is the number of a hard register in which the value
2261 of a called function may come back. */
2263 static bool
2264 aarch64_function_value_regno_p (const unsigned int regno)
2266 /* A maximum of 16 bytes can be returned in the general registers. Examples
2267 of 16-byte return values are 128-bit integers and 16-byte small
2268 structures (excluding homogeneous floating-point aggregates). */
2269 if (regno == R0_REGNUM || regno == R1_REGNUM)
2270 return true;
2272 /* Up to four fp/simd registers can return a function value, e.g. a
2273 homogeneous floating-point aggregate having four members. */
2274 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2275 return TARGET_FLOAT;
2277 return false;
2280 /* Implement TARGET_RETURN_IN_MEMORY.
2282 If the type T of the result of a function is such that
2283 void func (T arg)
2284 would require that arg be passed as a value in a register (or set of
2285 registers) according to the parameter passing rules, then the result
2286 is returned in the same registers as would be used for such an
2287 argument. */
2289 static bool
2290 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2292 HOST_WIDE_INT size;
2293 machine_mode ag_mode;
2294 int count;
2296 if (!AGGREGATE_TYPE_P (type)
2297 && TREE_CODE (type) != COMPLEX_TYPE
2298 && TREE_CODE (type) != VECTOR_TYPE)
2299 /* Simple scalar types are always returned in registers. */
2300 return false;
2302 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2303 type,
2304 &ag_mode,
2305 &count,
2306 NULL))
2307 return false;
2309 /* Types larger than 2 registers are returned in memory. */
2310 size = int_size_in_bytes (type);
2311 return (size < 0 || size > 2 * UNITS_PER_WORD);
2314 static bool
2315 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2316 const_tree type, int *nregs)
2318 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2319 return aarch64_vfp_is_call_or_return_candidate (mode,
2320 type,
2321 &pcum->aapcs_vfp_rmode,
2322 nregs,
2323 NULL);
2326 /* Given MODE and TYPE of a function argument, return the alignment in
2327 bits. The idea is to suppress any stronger alignment requested by
2328 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2329 This is a helper function for local use only. */
2331 static unsigned int
2332 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2334 if (!type)
2335 return GET_MODE_ALIGNMENT (mode);
2337 if (integer_zerop (TYPE_SIZE (type)))
2338 return 0;
2340 gcc_assert (TYPE_MODE (type) == mode);
2342 if (!AGGREGATE_TYPE_P (type))
2343 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2345 if (TREE_CODE (type) == ARRAY_TYPE)
2346 return TYPE_ALIGN (TREE_TYPE (type));
2348 unsigned int alignment = 0;
2349 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2350 if (TREE_CODE (field) == FIELD_DECL)
2351 alignment = std::max (alignment, DECL_ALIGN (field));
2353 return alignment;
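/* Standalone sketch (not GCC code) of the aggregate case above: the
   AAPCS64 alignment of a struct or union argument is the largest alignment
   among its fields, so over-alignment requested on the aggregate itself
   does not change how it is passed.  Field alignments are supplied as
   plain numbers purely for illustration; the result feeds the 16-byte
   (C.8) rule used later in aarch64_layout_arg.  */

unsigned int
example_aggregate_arg_alignment (const unsigned int *field_align_bits,
                                 int num_fields)
{
  unsigned int alignment = 0;
  for (int i = 0; i < num_fields; i++)
    if (field_align_bits[i] > alignment)
      alignment = field_align_bits[i];
  return alignment;
}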
2356 /* Layout a function argument according to the AAPCS64 rules. The rule
2357 numbers refer to the rule numbers in the AAPCS64. */
2359 static void
2360 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2361 const_tree type,
2362 bool named ATTRIBUTE_UNUSED)
2364 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2365 int ncrn, nvrn, nregs;
2366 bool allocate_ncrn, allocate_nvrn;
2367 HOST_WIDE_INT size;
2369 /* We need to do this once per argument. */
2370 if (pcum->aapcs_arg_processed)
2371 return;
2373 pcum->aapcs_arg_processed = true;
2375 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2376 size
2377 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2378 UNITS_PER_WORD);
2380 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2381 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2382 mode,
2383 type,
2384 &nregs);
2386 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2387 The following code thus handles passing by SIMD/FP registers first. */
2389 nvrn = pcum->aapcs_nvrn;
2391 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2392 and homogeneous short-vector aggregates (HVA). */
2393 if (allocate_nvrn)
2395 if (!TARGET_FLOAT)
2396 aarch64_err_no_fpadvsimd (mode, "argument");
2398 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2400 pcum->aapcs_nextnvrn = nvrn + nregs;
2401 if (!aarch64_composite_type_p (type, mode))
2403 gcc_assert (nregs == 1);
2404 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2406 else
2408 rtx par;
2409 int i;
2410 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2411 for (i = 0; i < nregs; i++)
2413 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2414 V0_REGNUM + nvrn + i);
2415 tmp = gen_rtx_EXPR_LIST
2416 (VOIDmode, tmp,
2417 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2418 XVECEXP (par, 0, i) = tmp;
2420 pcum->aapcs_reg = par;
2422 return;
2424 else
2426 /* C.3 NSRN is set to 8. */
2427 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2428 goto on_stack;
2432 ncrn = pcum->aapcs_ncrn;
2433 nregs = size / UNITS_PER_WORD;
2435 /* C6 - C9, though the sign and zero extension semantics are
2436 handled elsewhere. This is the case where the argument fits
2437 entirely in general registers. */
2438 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2441 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2443 /* C.8 if the argument has an alignment of 16 then the NGRN is
2444 rounded up to the next even number. */
2445 if (nregs == 2
2446 && ncrn % 2
2447 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2448 comparison is there because for > 16 * BITS_PER_UNIT
2449 alignment nregs should be > 2 and therefore it should be
2450 passed by reference rather than value. */
2451 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2453 ++ncrn;
2454 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2457 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2458 A reg is still generated for it, but the caller should be smart
2459 enough not to use it. */
2460 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2461 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2462 else
2464 rtx par;
2465 int i;
2467 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2468 for (i = 0; i < nregs; i++)
2470 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2471 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2472 GEN_INT (i * UNITS_PER_WORD));
2473 XVECEXP (par, 0, i) = tmp;
2475 pcum->aapcs_reg = par;
2478 pcum->aapcs_nextncrn = ncrn + nregs;
2479 return;
2482 /* C.11 */
2483 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2485 /* The argument is passed on the stack; record the needed number of words
2486 for this argument and align the total size if necessary. */
2487 on_stack:
2488 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2490 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2491 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2492 16 / UNITS_PER_WORD);
2493 return;
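/* Standalone sketch (not GCC code) of the general-register path above.
   Given the next core register number NCRN (0..8), the argument size in
   bytes and its AAPCS64 alignment in bits, return the first X register
   used, or -1 if the argument goes on the stack.  Assumes the argument is
   not an HFA/HVA and is not passed by reference.  */

int
example_first_core_arg_reg (int ncrn, long long size_bytes,
                            unsigned int align_bits)
{
  int nregs = (int) ((size_bytes + 7) / 8);  /* X registers needed.  */

  /* C.8: a 16-byte-aligned argument starts at an even register number.  */
  if (nregs == 2 && (ncrn & 1) && align_bits == 128)
    ncrn++;

  if (ncrn + nregs <= 8)  /* NUM_ARG_REGS is 8 (x0-x7).  */
    return ncrn;

  return -1;              /* C.11: the argument goes on the stack.  */
}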
2496 /* Implement TARGET_FUNCTION_ARG. */
2498 static rtx
2499 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2500 const_tree type, bool named)
2502 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2503 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2505 if (mode == VOIDmode)
2506 return NULL_RTX;
2508 aarch64_layout_arg (pcum_v, mode, type, named);
2509 return pcum->aapcs_reg;
2512 void
2513 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2514 const_tree fntype ATTRIBUTE_UNUSED,
2515 rtx libname ATTRIBUTE_UNUSED,
2516 const_tree fndecl ATTRIBUTE_UNUSED,
2517 unsigned n_named ATTRIBUTE_UNUSED)
2519 pcum->aapcs_ncrn = 0;
2520 pcum->aapcs_nvrn = 0;
2521 pcum->aapcs_nextncrn = 0;
2522 pcum->aapcs_nextnvrn = 0;
2523 pcum->pcs_variant = ARM_PCS_AAPCS64;
2524 pcum->aapcs_reg = NULL_RTX;
2525 pcum->aapcs_arg_processed = false;
2526 pcum->aapcs_stack_words = 0;
2527 pcum->aapcs_stack_size = 0;
2529 if (!TARGET_FLOAT
2530 && fndecl && TREE_PUBLIC (fndecl)
2531 && fntype && fntype != error_mark_node)
2533 const_tree type = TREE_TYPE (fntype);
2534 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2535 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2536 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2537 &mode, &nregs, NULL))
2538 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2540 return;
2543 static void
2544 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2545 machine_mode mode,
2546 const_tree type,
2547 bool named)
2549 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2550 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2552 aarch64_layout_arg (pcum_v, mode, type, named);
2553 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2554 != (pcum->aapcs_stack_words != 0));
2555 pcum->aapcs_arg_processed = false;
2556 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2557 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2558 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2559 pcum->aapcs_stack_words = 0;
2560 pcum->aapcs_reg = NULL_RTX;
2564 bool
2565 aarch64_function_arg_regno_p (unsigned regno)
2567 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2568 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2571 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2572 PARM_BOUNDARY bits of alignment, but will be given anything up
2573 to STACK_BOUNDARY bits if the type requires it. This makes sure
2574 that both before and after the layout of each argument, the Next
2575 Stacked Argument Address (NSAA) will have a minimum alignment of
2576 8 bytes. */
2578 static unsigned int
2579 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2581 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2582 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2585 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2587 Return true if an argument passed on the stack should be padded upwards,
2588 i.e. if the least-significant byte of the stack slot has useful data.
2590 Small aggregate types are placed at the lowest memory address.
2592 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2594 bool
2595 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2597 /* On little-endian targets, the least significant byte of every stack
2598 argument is passed at the lowest byte address of the stack slot. */
2599 if (!BYTES_BIG_ENDIAN)
2600 return true;
2602 /* Otherwise, integral, floating-point and pointer types are padded downward:
2603 the least significant byte of a stack argument is passed at the highest
2604 byte address of the stack slot. */
2605 if (type
2606 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2607 || POINTER_TYPE_P (type))
2608 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2609 return false;
2611 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2612 return true;
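/* Standalone sketch (not GCC code) of the stack-argument padding rule
   above.  Padding upward means the useful data starts at the lowest
   address of the slot; on big-endian, scalars instead sit at the high end
   of the slot, so they are padded downward.  */

enum example_pad_direction
{
  EXAMPLE_PAD_UPWARD,
  EXAMPLE_PAD_DOWNWARD
};

enum example_pad_direction
example_stack_arg_padding (int big_endian_p, int scalar_p)
{
  if (!big_endian_p)
    return EXAMPLE_PAD_UPWARD;             /* Little-endian: always upward.  */
  return scalar_p ? EXAMPLE_PAD_DOWNWARD   /* Integral/FP/pointer.  */
                  : EXAMPLE_PAD_UPWARD;    /* Small aggregates etc.  */
}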
2615 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2617 It specifies the padding for the last (possibly the only)
2618 element of a block move between registers and memory. Viewing
2619 the block as it sits in memory, padding upward means that the
2620 last element is padded after its most significant byte, while
2621 with downward padding the last element is padded on its least
2622 significant byte side.
2624 Small aggregates and small complex types are always padded
2625 upwards.
2627 We don't need to worry about homogeneous floating-point or
2628 short-vector aggregates; their move is not affected by the
2629 padding direction determined here. Regardless of endianness,
2630 each element of such an aggregate is put in the least
2631 significant bits of a fp/simd register.
2633 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2634 register has useful data, and return the opposite if the most
2635 significant byte does. */
2637 bool
2638 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2639 bool first ATTRIBUTE_UNUSED)
2642 /* Small composite types are always padded upward. */
2643 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2645 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2646 : GET_MODE_SIZE (mode));
2647 if (size < 2 * UNITS_PER_WORD)
2648 return true;
2651 /* Otherwise, use the default padding. */
2652 return !BYTES_BIG_ENDIAN;
2655 static machine_mode
2656 aarch64_libgcc_cmp_return_mode (void)
2658 return SImode;
2661 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2663 /* We use the 12-bit shifted immediate arithmetic instructions, so values
2664 must be a multiple of (1 << 12), i.e. 4096. */
2665 #define ARITH_FACTOR 4096
2667 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2668 #error Cannot use simple address calculation for stack probing
2669 #endif
2671 /* The pair of scratch registers used for stack probing. */
2672 #define PROBE_STACK_FIRST_REG 9
2673 #define PROBE_STACK_SECOND_REG 10
2675 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2676 inclusive. These are offsets from the current stack pointer. */
2678 static void
2679 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2681 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2683 /* See the same assertion on PROBE_INTERVAL above. */
2684 gcc_assert ((first % ARITH_FACTOR) == 0);
2686 /* See if we have a constant small number of probes to generate. If so,
2687 that's the easy case. */
2688 if (size <= PROBE_INTERVAL)
2690 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2692 emit_set_insn (reg1,
2693 plus_constant (Pmode,
2694 stack_pointer_rtx, -(first + base)));
2695 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2698 /* The run-time loop is made up of 8 insns in the generic case while the
2699 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2700 else if (size <= 4 * PROBE_INTERVAL)
2702 HOST_WIDE_INT i, rem;
2704 emit_set_insn (reg1,
2705 plus_constant (Pmode,
2706 stack_pointer_rtx,
2707 -(first + PROBE_INTERVAL)));
2708 emit_stack_probe (reg1);
2710 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2711 it exceeds SIZE. If only two probes are needed, this will not
2712 generate any code. Then probe at FIRST + SIZE. */
2713 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2715 emit_set_insn (reg1,
2716 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2717 emit_stack_probe (reg1);
2720 rem = size - (i - PROBE_INTERVAL);
2721 if (rem > 256)
2723 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2725 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2726 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2728 else
2729 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2732 /* Otherwise, do the same as above, but in a loop. Note that we must be
2733 extra careful with variables wrapping around because we might be at
2734 the very top (or the very bottom) of the address space and we have
2735 to be able to handle this case properly; in particular, we use an
2736 equality test for the loop condition. */
2737 else
2739 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2741 /* Step 1: round SIZE to the previous multiple of the interval. */
2743 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2746 /* Step 2: compute initial and final value of the loop counter. */
2748 /* TEST_ADDR = SP + FIRST. */
2749 emit_set_insn (reg1,
2750 plus_constant (Pmode, stack_pointer_rtx, -first));
2752 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2753 HOST_WIDE_INT adjustment = - (first + rounded_size);
2754 if (! aarch64_uimm12_shift (adjustment))
2756 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2757 true, Pmode);
2758 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2760 else
2762 emit_set_insn (reg2,
2763 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2766 /* Step 3: the loop
2770 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2771 probe at TEST_ADDR
2773 while (TEST_ADDR != LAST_ADDR)
2775 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2776 until it is equal to ROUNDED_SIZE. */
2778 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2781 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2782 that SIZE is equal to ROUNDED_SIZE. */
2784 if (size != rounded_size)
2786 HOST_WIDE_INT rem = size - rounded_size;
2788 if (rem > 256)
2790 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2792 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2793 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2795 else
2796 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2800 /* Make sure nothing is scheduled before we are done. */
2801 emit_insn (gen_blockage ());
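/* Standalone sketch (not GCC code) of the offsets probed above, assuming
   the default 4096-byte PROBE_INTERVAL: one probe per full interval below
   the incoming stack pointer, plus a final probe at FIRST + SIZE when SIZE
   is not a multiple of the interval.  Offsets are returned as positive
   distances below the incoming stack pointer.  */

int
example_probe_offsets (long long first, long long size,
                       long long *offsets, int max_offsets)
{
  const long long interval = 4096;
  int n = 0;
  long long probed = 0;

  while (probed + interval <= size && n < max_offsets)
    {
      probed += interval;
      offsets[n++] = first + probed;
    }

  if (probed < size && n < max_offsets)
    offsets[n++] = first + size;   /* Final partial-interval probe.  */

  return n;
}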
2804 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2805 absolute addresses. */
2807 const char *
2808 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2810 static int labelno = 0;
2811 char loop_lab[32];
2812 rtx xops[2];
2814 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2816 /* Loop. */
2817 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2819 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2820 xops[0] = reg1;
2821 xops[1] = GEN_INT (PROBE_INTERVAL);
2822 output_asm_insn ("sub\t%0, %0, %1", xops);
2824 /* Probe at TEST_ADDR. */
2825 output_asm_insn ("str\txzr, [%0]", xops);
2827 /* Test if TEST_ADDR == LAST_ADDR. */
2828 xops[1] = reg2;
2829 output_asm_insn ("cmp\t%0, %1", xops);
2831 /* Branch. */
2832 fputs ("\tb.ne\t", asm_out_file);
2833 assemble_name_raw (asm_out_file, loop_lab);
2834 fputc ('\n', asm_out_file);
2836 return "";
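/* For reference, with the default 4096-byte PROBE_INTERVAL and the x9/x10
   scratch registers chosen above, the emitted loop looks roughly like:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */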
2839 static bool
2840 aarch64_frame_pointer_required (void)
2842 /* In aarch64_override_options_after_change
2843 flag_omit_leaf_frame_pointer turns off the frame pointer by
2844 default. Turn it back on now if this is not a leaf function
2845 (or if LR is ever live in it). */
2846 if (flag_omit_leaf_frame_pointer
2847 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2848 return true;
2850 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2851 if (crtl->calls_eh_return)
2852 return true;
2854 return false;
2857 /* Mark the registers that need to be saved by the callee and calculate
2858 the size of the callee-saved registers area and frame record (both FP
2859 and LR may be omitted). */
2860 static void
2861 aarch64_layout_frame (void)
2863 HOST_WIDE_INT offset = 0;
2864 int regno, last_fp_reg = INVALID_REGNUM;
2866 if (reload_completed && cfun->machine->frame.laid_out)
2867 return;
2869 #define SLOT_NOT_REQUIRED (-2)
2870 #define SLOT_REQUIRED (-1)
2872 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2873 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2875 /* First mark all the registers that really need to be saved... */
2876 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2877 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2879 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2880 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2882 /* ... that includes the eh data registers (if needed)... */
2883 if (crtl->calls_eh_return)
2884 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2885 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2886 = SLOT_REQUIRED;
2888 /* ... and any callee saved register that dataflow says is live. */
2889 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2890 if (df_regs_ever_live_p (regno)
2891 && (regno == R30_REGNUM
2892 || !call_used_regs[regno]))
2893 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2895 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2896 if (df_regs_ever_live_p (regno)
2897 && !call_used_regs[regno])
2899 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2900 last_fp_reg = regno;
2903 if (frame_pointer_needed)
2905 /* FP and LR are placed in the linkage record. */
2906 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2907 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2908 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2909 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2910 offset += 2 * UNITS_PER_WORD;
2913 /* Now assign stack slots for them. */
2914 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2915 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2917 cfun->machine->frame.reg_offset[regno] = offset;
2918 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2919 cfun->machine->frame.wb_candidate1 = regno;
2920 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2921 cfun->machine->frame.wb_candidate2 = regno;
2922 offset += UNITS_PER_WORD;
2925 HOST_WIDE_INT max_int_offset = offset;
2926 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2927 bool has_align_gap = offset != max_int_offset;
2929 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2930 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2932 /* If there is an alignment gap between integer and fp callee-saves,
2933 allocate the last fp register to it if possible. */
2934 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2936 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2937 break;
2940 cfun->machine->frame.reg_offset[regno] = offset;
2941 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2942 cfun->machine->frame.wb_candidate1 = regno;
2943 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2944 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2945 cfun->machine->frame.wb_candidate2 = regno;
2946 offset += UNITS_PER_WORD;
2949 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951 cfun->machine->frame.saved_regs_size = offset;
2953 HOST_WIDE_INT varargs_and_saved_regs_size
2954 = offset + cfun->machine->frame.saved_varargs_size;
2956 cfun->machine->frame.hard_fp_offset
2957 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2958 STACK_BOUNDARY / BITS_PER_UNIT);
2960 cfun->machine->frame.frame_size
2961 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2962 + crtl->outgoing_args_size,
2963 STACK_BOUNDARY / BITS_PER_UNIT);
2965 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2967 cfun->machine->frame.initial_adjust = 0;
2968 cfun->machine->frame.final_adjust = 0;
2969 cfun->machine->frame.callee_adjust = 0;
2970 cfun->machine->frame.callee_offset = 0;
2972 HOST_WIDE_INT max_push_offset = 0;
2973 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2974 max_push_offset = 512;
2975 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2976 max_push_offset = 256;
2978 if (cfun->machine->frame.frame_size < max_push_offset
2979 && crtl->outgoing_args_size == 0)
2981 /* Simple, small frame with no outgoing arguments:
2982 stp reg1, reg2, [sp, -frame_size]!
2983 stp reg3, reg4, [sp, 16] */
2984 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2986 else if ((crtl->outgoing_args_size
2987 + cfun->machine->frame.saved_regs_size < 512)
2988 && !(cfun->calls_alloca
2989 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2991 /* Frame with small outgoing arguments:
2992 sub sp, sp, frame_size
2993 stp reg1, reg2, [sp, outgoing_args_size]
2994 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2995 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2996 cfun->machine->frame.callee_offset
2997 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2999 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3001 /* Frame with large outgoing arguments but a small local area:
3002 stp reg1, reg2, [sp, -hard_fp_offset]!
3003 stp reg3, reg4, [sp, 16]
3004 sub sp, sp, outgoing_args_size */
3005 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3006 cfun->machine->frame.final_adjust
3007 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3009 else if (!frame_pointer_needed
3010 && varargs_and_saved_regs_size < max_push_offset)
3012 /* Frame with large local area and outgoing arguments (this pushes the
3013 callee-saves first, followed by the locals and outgoing area):
3014 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3015 stp reg3, reg4, [sp, 16]
3016 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3017 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3018 cfun->machine->frame.final_adjust
3019 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3020 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3021 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3023 else
3025 /* Frame with large local area and outgoing arguments using frame pointer:
3026 sub sp, sp, hard_fp_offset
3027 stp x29, x30, [sp, 0]
3028 add x29, sp, 0
3029 stp reg3, reg4, [sp, 16]
3030 sub sp, sp, outgoing_args_size */
3031 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3032 cfun->machine->frame.final_adjust
3033 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3036 cfun->machine->frame.laid_out = true;
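/* Hypothetical standalone sketch (not GCC code) of the adjustment-strategy
   choice made above.  Given the overall frame size, the outgoing-argument
   area, the frame-record offset (hard_fp_offset) and the callee-save size,
   it names the shape the prologue will use.  The 512/256 push limits come
   from the writeback offset range of the STP/STR used for the initial
   push; the no-frame-pointer push variant above is folded into
   EXAMPLE_GENERAL_FRAME here to keep the sketch short.  */

enum example_frame_shape
{
  EXAMPLE_SMALL_PUSH_FRAME,      /* stp reg1, reg2, [sp, -frame_size]!  */
  EXAMPLE_SMALL_OUTGOING_FRAME,  /* sub sp, then stp at outgoing_args_size  */
  EXAMPLE_LARGE_OUTGOING_FRAME,  /* stp [sp, -hard_fp_offset]!, then sub sp  */
  EXAMPLE_GENERAL_FRAME          /* sub sp twice, frame record in between  */
};

enum example_frame_shape
example_pick_frame_shape (long long frame_size, long long outgoing_args,
                          long long hard_fp_offset, long long saved_regs,
                          int two_push_regs_p, int calls_alloca_p)
{
  long long max_push = two_push_regs_p ? 512 : 256;

  if (frame_size < max_push && outgoing_args == 0)
    return EXAMPLE_SMALL_PUSH_FRAME;
  if (outgoing_args + saved_regs < 512
      && !(calls_alloca_p && hard_fp_offset < max_push))
    return EXAMPLE_SMALL_OUTGOING_FRAME;
  if (hard_fp_offset < max_push)
    return EXAMPLE_LARGE_OUTGOING_FRAME;
  return EXAMPLE_GENERAL_FRAME;
}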
3039 /* Return true if the register REGNO is saved on entry to
3040 the current function. */
3042 static bool
3043 aarch64_register_saved_on_entry (int regno)
3045 return cfun->machine->frame.reg_offset[regno] >= 0;
3048 /* Return the next register from REGNO up to LIMIT that the callee
3049 needs to save. */
3051 static unsigned
3052 aarch64_next_callee_save (unsigned regno, unsigned limit)
3054 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3055 regno ++;
3056 return regno;
3059 /* Push the register number REGNO of mode MODE to the stack with write-back
3060 adjusting the stack by ADJUSTMENT. */
3062 static void
3063 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3064 HOST_WIDE_INT adjustment)
3066 rtx base_rtx = stack_pointer_rtx;
3067 rtx insn, reg, mem;
3069 reg = gen_rtx_REG (mode, regno);
3070 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3071 plus_constant (Pmode, base_rtx, -adjustment));
3072 mem = gen_rtx_MEM (mode, mem);
3074 insn = emit_move_insn (mem, reg);
3075 RTX_FRAME_RELATED_P (insn) = 1;
3078 /* Generate and return an instruction to store the pair of registers
3079 REG and REG2 of mode MODE to location BASE with write-back adjusting
3080 the stack location BASE by ADJUSTMENT. */
3082 static rtx
3083 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3084 HOST_WIDE_INT adjustment)
3086 switch (mode)
3088 case DImode:
3089 return gen_storewb_pairdi_di (base, base, reg, reg2,
3090 GEN_INT (-adjustment),
3091 GEN_INT (UNITS_PER_WORD - adjustment));
3092 case DFmode:
3093 return gen_storewb_pairdf_di (base, base, reg, reg2,
3094 GEN_INT (-adjustment),
3095 GEN_INT (UNITS_PER_WORD - adjustment));
3096 default:
3097 gcc_unreachable ();
3101 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3102 stack pointer by ADJUSTMENT. */
3104 static void
3105 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3107 rtx_insn *insn;
3108 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3110 if (regno2 == INVALID_REGNUM)
3111 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3113 rtx reg1 = gen_rtx_REG (mode, regno1);
3114 rtx reg2 = gen_rtx_REG (mode, regno2);
3116 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3117 reg2, adjustment));
3118 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3119 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3120 RTX_FRAME_RELATED_P (insn) = 1;
3123 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3124 adjusting it by ADJUSTMENT afterwards. */
3126 static rtx
3127 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3128 HOST_WIDE_INT adjustment)
3130 switch (mode)
3132 case DImode:
3133 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3134 GEN_INT (UNITS_PER_WORD));
3135 case DFmode:
3136 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3137 GEN_INT (UNITS_PER_WORD));
3138 default:
3139 gcc_unreachable ();
3143 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3144 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3145 into CFI_OPS. */
3147 static void
3148 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3149 rtx *cfi_ops)
3151 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3152 rtx reg1 = gen_rtx_REG (mode, regno1);
3154 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3156 if (regno2 == INVALID_REGNUM)
3158 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3159 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3160 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3162 else
3164 rtx reg2 = gen_rtx_REG (mode, regno2);
3165 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3166 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3167 reg2, adjustment));
3171 /* Generate and return a store pair instruction of mode MODE to store
3172 register REG1 to MEM1 and register REG2 to MEM2. */
3174 static rtx
3175 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3176 rtx reg2)
3178 switch (mode)
3180 case DImode:
3181 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3183 case DFmode:
3184 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3186 default:
3187 gcc_unreachable ();
3191 /* Generate and return a load pair instruction of mode MODE to load register
3192 REG1 from MEM1 and register REG2 from MEM2. */
3194 static rtx
3195 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3196 rtx mem2)
3198 switch (mode)
3200 case DImode:
3201 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3203 case DFmode:
3204 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3206 default:
3207 gcc_unreachable ();
3211 /* Return TRUE if return address signing should be enabled for the current
3212 function, otherwise return FALSE. */
3214 bool
3215 aarch64_return_address_signing_enabled (void)
3217 /* This function should only be called after the frame is laid out. */
3218 gcc_assert (cfun->machine->frame.laid_out);
3220 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
3221 function if its LR is pushed onto the stack. */
3222 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3223 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3224 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3227 /* Emit code to save the callee-saved registers from register number START
3228 to LIMIT to the stack at the location starting at offset START_OFFSET,
3229 skipping any write-back candidates if SKIP_WB is true. */
3231 static void
3232 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3233 unsigned start, unsigned limit, bool skip_wb)
3235 rtx_insn *insn;
3236 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3237 ? gen_frame_mem : gen_rtx_MEM);
3238 unsigned regno;
3239 unsigned regno2;
3241 for (regno = aarch64_next_callee_save (start, limit);
3242 regno <= limit;
3243 regno = aarch64_next_callee_save (regno + 1, limit))
3245 rtx reg, mem;
3246 HOST_WIDE_INT offset;
3248 if (skip_wb
3249 && (regno == cfun->machine->frame.wb_candidate1
3250 || regno == cfun->machine->frame.wb_candidate2))
3251 continue;
3253 if (cfun->machine->reg_is_wrapped_separately[regno])
3254 continue;
3256 reg = gen_rtx_REG (mode, regno);
3257 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3258 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3259 offset));
3261 regno2 = aarch64_next_callee_save (regno + 1, limit);
3263 if (regno2 <= limit
3264 && !cfun->machine->reg_is_wrapped_separately[regno2]
3265 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3266 == cfun->machine->frame.reg_offset[regno2]))
3269 rtx reg2 = gen_rtx_REG (mode, regno2);
3270 rtx mem2;
3272 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3273 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3274 offset));
3275 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3276 reg2));
3278 /* The first part of a frame-related parallel insn is
3279 always assumed to be relevant to the frame
3280 calculations; subsequent parts are only
3281 frame-related if explicitly marked. */
3282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3283 regno = regno2;
3285 else
3286 insn = emit_move_insn (mem, reg);
3288 RTX_FRAME_RELATED_P (insn) = 1;
3292 /* Emit code to restore the callee registers of mode MODE from register
3293 number START up to and including LIMIT. Restore from the stack offset
3294 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3295 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3297 static void
3298 aarch64_restore_callee_saves (machine_mode mode,
3299 HOST_WIDE_INT start_offset, unsigned start,
3300 unsigned limit, bool skip_wb, rtx *cfi_ops)
3302 rtx base_rtx = stack_pointer_rtx;
3303 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3304 ? gen_frame_mem : gen_rtx_MEM);
3305 unsigned regno;
3306 unsigned regno2;
3307 HOST_WIDE_INT offset;
3309 for (regno = aarch64_next_callee_save (start, limit);
3310 regno <= limit;
3311 regno = aarch64_next_callee_save (regno + 1, limit))
3313 if (cfun->machine->reg_is_wrapped_separately[regno])
3314 continue;
3316 rtx reg, mem;
3318 if (skip_wb
3319 && (regno == cfun->machine->frame.wb_candidate1
3320 || regno == cfun->machine->frame.wb_candidate2))
3321 continue;
3323 reg = gen_rtx_REG (mode, regno);
3324 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3325 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3327 regno2 = aarch64_next_callee_save (regno + 1, limit);
3329 if (regno2 <= limit
3330 && !cfun->machine->reg_is_wrapped_separately[regno2]
3331 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3332 == cfun->machine->frame.reg_offset[regno2]))
3334 rtx reg2 = gen_rtx_REG (mode, regno2);
3335 rtx mem2;
3337 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3338 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3339 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3341 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3342 regno = regno2;
3344 else
3345 emit_move_insn (reg, mem);
3346 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3350 static inline bool
3351 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3352 HOST_WIDE_INT offset)
3354 return offset >= -256 && offset < 256;
3357 static inline bool
3358 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3360 return (offset >= 0
3361 && offset < 4096 * GET_MODE_SIZE (mode)
3362 && offset % GET_MODE_SIZE (mode) == 0);
3365 bool
3366 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3368 return (offset >= -64 * GET_MODE_SIZE (mode)
3369 && offset < 64 * GET_MODE_SIZE (mode)
3370 && offset % GET_MODE_SIZE (mode) == 0);
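/* Worked example (not GCC code): for DImode (8-byte) accesses the three
   predicates above accept
     - 9-bit signed unscaled offsets:  -256 .. 255         (LDUR/STUR),
     - 12-bit unsigned scaled offsets: 0 .. 32760, step 8   (LDR/STR),
     - 7-bit signed scaled offsets:    -512 .. 504, step 8  (LDP/STP).  */

int
example_dimode_pair_offset_ok_p (long long offset)
{
  return offset >= -64 * 8 && offset < 64 * 8 && (offset % 8) == 0;
}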
3373 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3375 static sbitmap
3376 aarch64_get_separate_components (void)
3378 aarch64_layout_frame ();
3380 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3381 bitmap_clear (components);
3383 /* The registers we need to save to the frame. */
3384 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3385 if (aarch64_register_saved_on_entry (regno))
3387 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3388 if (!frame_pointer_needed)
3389 offset += cfun->machine->frame.frame_size
3390 - cfun->machine->frame.hard_fp_offset;
3391 /* Check that we can access the stack slot of the register with one
3392 direct load with no adjustments needed. */
3393 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3394 bitmap_set_bit (components, regno);
3397 /* Don't mess with the hard frame pointer. */
3398 if (frame_pointer_needed)
3399 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3401 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3402 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3403 /* If aarch64_layout_frame has chosen registers to store/restore with
3404 writeback, don't interfere with them, to avoid having to output explicit
3405 stack adjustment instructions. */
3406 if (reg2 != INVALID_REGNUM)
3407 bitmap_clear_bit (components, reg2);
3408 if (reg1 != INVALID_REGNUM)
3409 bitmap_clear_bit (components, reg1);
3411 bitmap_clear_bit (components, LR_REGNUM);
3412 bitmap_clear_bit (components, SP_REGNUM);
3414 return components;
3417 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3419 static sbitmap
3420 aarch64_components_for_bb (basic_block bb)
3422 bitmap in = DF_LIVE_IN (bb);
3423 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3424 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3426 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3427 bitmap_clear (components);
3429 /* Registers are used in a bb if they are in the IN, GEN, or KILL sets. */
3430 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3431 if ((!call_used_regs[regno])
3432 && (bitmap_bit_p (in, regno)
3433 || bitmap_bit_p (gen, regno)
3434 || bitmap_bit_p (kill, regno)))
3435 bitmap_set_bit (components, regno);
3437 return components;
3440 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3441 Nothing to do for aarch64. */
3443 static void
3444 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3448 /* Return the next set bit in BMP from START onwards. Return the total number
3449 of bits in BMP if no set bit is found at or after START. */
3451 static unsigned int
3452 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3454 unsigned int nbits = SBITMAP_SIZE (bmp);
3455 if (start == nbits)
3456 return start;
3458 gcc_assert (start < nbits);
3459 for (unsigned int i = start; i < nbits; i++)
3460 if (bitmap_bit_p (bmp, i))
3461 return i;
3463 return nbits;
3466 /* Do the work for aarch64_emit_prologue_components and
3467 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3468 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3469 for these components or the epilogue sequence. That is, it determines
3470 whether we should emit stores or loads and what kind of CFA notes to attach
3471 to the insns. Otherwise the logic for the two sequences is very
3472 similar. */
3474 static void
3475 aarch64_process_components (sbitmap components, bool prologue_p)
3477 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3478 ? HARD_FRAME_POINTER_REGNUM
3479 : STACK_POINTER_REGNUM);
3481 unsigned last_regno = SBITMAP_SIZE (components);
3482 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3483 rtx_insn *insn = NULL;
3485 while (regno != last_regno)
3487 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
3488 so DFmode for the vector registers is enough. */
3489 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3490 rtx reg = gen_rtx_REG (mode, regno);
3491 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3492 if (!frame_pointer_needed)
3493 offset += cfun->machine->frame.frame_size
3494 - cfun->machine->frame.hard_fp_offset;
3495 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3496 rtx mem = gen_frame_mem (mode, addr);
3498 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3499 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3500 /* No more registers to handle after REGNO.
3501 Emit a single save/restore and exit. */
3502 if (regno2 == last_regno)
3504 insn = emit_insn (set);
3505 RTX_FRAME_RELATED_P (insn) = 1;
3506 if (prologue_p)
3507 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3508 else
3509 add_reg_note (insn, REG_CFA_RESTORE, reg);
3510 break;
3513 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3514 /* The next register is not of the same class or its offset is not
3515 mergeable with the current one into a pair. */
3516 if (!satisfies_constraint_Ump (mem)
3517 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3518 || (offset2 - cfun->machine->frame.reg_offset[regno])
3519 != GET_MODE_SIZE (mode))
3521 insn = emit_insn (set);
3522 RTX_FRAME_RELATED_P (insn) = 1;
3523 if (prologue_p)
3524 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3525 else
3526 add_reg_note (insn, REG_CFA_RESTORE, reg);
3528 regno = regno2;
3529 continue;
3532 /* REGNO2 can be saved/restored in a pair with REGNO. */
3533 rtx reg2 = gen_rtx_REG (mode, regno2);
3534 if (!frame_pointer_needed)
3535 offset2 += cfun->machine->frame.frame_size
3536 - cfun->machine->frame.hard_fp_offset;
3537 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3538 rtx mem2 = gen_frame_mem (mode, addr2);
3539 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3540 : gen_rtx_SET (reg2, mem2);
3542 if (prologue_p)
3543 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3544 else
3545 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3547 RTX_FRAME_RELATED_P (insn) = 1;
3548 if (prologue_p)
3550 add_reg_note (insn, REG_CFA_OFFSET, set);
3551 add_reg_note (insn, REG_CFA_OFFSET, set2);
3553 else
3555 add_reg_note (insn, REG_CFA_RESTORE, reg);
3556 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3559 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3563 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3565 static void
3566 aarch64_emit_prologue_components (sbitmap components)
3568 aarch64_process_components (components, true);
3571 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3573 static void
3574 aarch64_emit_epilogue_components (sbitmap components)
3576 aarch64_process_components (components, false);
3579 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3581 static void
3582 aarch64_set_handled_components (sbitmap components)
3584 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3585 if (bitmap_bit_p (components, regno))
3586 cfun->machine->reg_is_wrapped_separately[regno] = true;
3589 /* AArch64 stack frames generated by this compiler look like:
3591 +-------------------------------+
3593 | incoming stack arguments |
3595 +-------------------------------+
3596 | | <-- incoming stack pointer (aligned)
3597 | callee-allocated save area |
3598 | for register varargs |
3600 +-------------------------------+
3601 | local variables | <-- frame_pointer_rtx
3603 +-------------------------------+
3604 | padding0 | \
3605 +-------------------------------+ |
3606 | callee-saved registers | | frame.saved_regs_size
3607 +-------------------------------+ |
3608 | LR' | |
3609 +-------------------------------+ |
3610 | FP' | / <- hard_frame_pointer_rtx (aligned)
3611 +-------------------------------+
3612 | dynamic allocation |
3613 +-------------------------------+
3614 | padding |
3615 +-------------------------------+
3616 | outgoing stack arguments | <-- arg_pointer
3618 +-------------------------------+
3619 | | <-- stack_pointer_rtx (aligned)
3621 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3622 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3623 unchanged. */
3625 /* Generate the prologue instructions for entry into a function.
3626 Establish the stack frame by decreasing the stack pointer with a
3627 properly calculated size and, if necessary, create a frame record
3628 filled with the values of LR and previous frame pointer. The
3629 current FP is also set up if it is in use. */
3631 void
3632 aarch64_expand_prologue (void)
3634 aarch64_layout_frame ();
3636 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3637 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3638 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3639 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3640 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3641 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3642 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3643 rtx_insn *insn;
3645 /* Sign return address for functions. */
3646 if (aarch64_return_address_signing_enabled ())
3648 insn = emit_insn (gen_pacisp ());
3649 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3650 RTX_FRAME_RELATED_P (insn) = 1;
3653 if (flag_stack_usage_info)
3654 current_function_static_stack_size = frame_size;
3656 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3658 if (crtl->is_leaf && !cfun->calls_alloca)
3660 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3661 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3662 frame_size - STACK_CHECK_PROTECT);
3664 else if (frame_size > 0)
3665 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3668 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3670 if (callee_adjust != 0)
3671 aarch64_push_regs (reg1, reg2, callee_adjust);
3673 if (frame_pointer_needed)
3675 if (callee_adjust == 0)
3676 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3677 R30_REGNUM, false);
3678 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3679 stack_pointer_rtx,
3680 GEN_INT (callee_offset)));
3681 RTX_FRAME_RELATED_P (insn) = 1;
3682 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3685 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3686 callee_adjust != 0 || frame_pointer_needed);
3687 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3688 callee_adjust != 0 || frame_pointer_needed);
3689 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3692 /* Return TRUE if we can use a simple_return insn.
3694 This function checks whether the callee-saved stack is empty, which
3695 means no restore actions are needed. The pro_and_epilogue pass will use
3696 this to check whether the shrink-wrapping optimization is feasible. */
3698 bool
3699 aarch64_use_return_insn_p (void)
3701 if (!reload_completed)
3702 return false;
3704 if (crtl->profile)
3705 return false;
3707 aarch64_layout_frame ();
3709 return cfun->machine->frame.frame_size == 0;
3712 /* Generate the epilogue instructions for returning from a function.
3713 This is almost exactly the reverse of the prologue sequence, except
3714 that we need to insert barriers to avoid scheduling loads that read
3715 from a deallocated stack, and we optimize the unwind records by
3716 emitting them all together if possible. */
3717 void
3718 aarch64_expand_epilogue (bool for_sibcall)
3720 aarch64_layout_frame ();
3722 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3723 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3724 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3725 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3726 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3727 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3728 rtx cfi_ops = NULL;
3729 rtx_insn *insn;
3731 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3732 bool need_barrier_p = (get_frame_size ()
3733 + cfun->machine->frame.saved_varargs_size) != 0;
3735 /* Emit a barrier to prevent loads from a deallocated stack. */
3736 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3737 || crtl->calls_eh_return)
3739 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3740 need_barrier_p = false;
3743 /* Restore the stack pointer from the frame pointer if it may not
3744 be the same as the stack pointer. */
3745 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3747 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3748 hard_frame_pointer_rtx,
3749 GEN_INT (-callee_offset)));
3750 /* If writeback is used when restoring callee-saves, the CFA
3751 is restored on the instruction doing the writeback. */
3752 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3754 else
3755 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3757 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3758 callee_adjust != 0, &cfi_ops);
3759 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3760 callee_adjust != 0, &cfi_ops);
3762 if (need_barrier_p)
3763 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3765 if (callee_adjust != 0)
3766 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3768 if (callee_adjust != 0 || initial_adjust > 65536)
3770 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3771 insn = get_last_insn ();
3772 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3773 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3774 RTX_FRAME_RELATED_P (insn) = 1;
3775 cfi_ops = NULL;
3778 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3780 if (cfi_ops)
3782 /* Emit delayed restores and reset the CFA to be SP. */
3783 insn = get_last_insn ();
3784 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3785 REG_NOTES (insn) = cfi_ops;
3786 RTX_FRAME_RELATED_P (insn) = 1;
3789 /* We prefer to emit the combined return/authenticate instruction RETAA,
3790 however there are three cases in which we must instead emit an explicit
3791 authentication instruction.
3793 1) Sibcalls don't return in a normal way, so if we're about to call one
3794 we must authenticate.
3796 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3797 generating code for !TARGET_ARMV8_3 we can't use it and must
3798 explicitly authenticate.
3800 3) On an eh_return path we make extra stack adjustments to update the
3801 canonical frame address to be the exception handler's CFA. We want
3802 to authenticate using the CFA of the function which calls eh_return. */
3804 if (aarch64_return_address_signing_enabled ()
3805 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3807 insn = emit_insn (gen_autisp ());
3808 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3809 RTX_FRAME_RELATED_P (insn) = 1;
3812 /* Stack adjustment for exception handler. */
3813 if (crtl->calls_eh_return)
3815 /* We need to unwind the stack by the offset computed by
3816 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3817 to be SP; letting the CFA move during this adjustment
3818 is just as correct as retaining the CFA from the body
3819 of the function. Therefore, do nothing special. */
3820 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3823 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3824 if (!for_sibcall)
3825 emit_jump_insn (ret_rtx);
3828 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3829 normally or return to a previous frame after unwinding.
3831 An EH return uses a single shared return sequence. The epilogue is
3832 exactly like a normal epilogue except that it has an extra input
3833 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3834 that must be applied after the frame has been destroyed. An extra label
3835 is inserted before the epilogue which initializes this register to zero,
3836 and this is the entry point for a normal return.
3838 An actual EH return updates the return address, initializes the stack
3839 adjustment and jumps directly into the epilogue (bypassing the zeroing
3840 of the adjustment). Since the return address is typically saved on the
3841 stack when a function makes a call, the saved LR must be updated outside
3842 the epilogue.
3844 This poses problems as the store is generated well before the epilogue,
3845 so the offset of LR is not known yet. Also optimizations will remove the
3846 store as it appears dead, even after the epilogue is generated (as the
3847 base or offset for loading LR is different in many cases).
3849 To avoid these problems this implementation forces the frame pointer
3850 in eh_return functions so that the location of LR is fixed and known early.
3851 It also marks the store volatile, so no optimization is permitted to
3852 remove the store. */
3854 aarch64_eh_return_handler_rtx (void)
3856 rtx tmp = gen_frame_mem (Pmode,
3857 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3859 /* Mark the store volatile, so no optimization is permitted to remove it. */
3860 MEM_VOLATILE_P (tmp) = true;
3861 return tmp;
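/* With the frame pointer forced, LR is saved next to the frame pointer,
   so the slot above is normally the saved-LR location at [x29, #8]
   (UNITS_PER_WORD is 8 on AArch64). */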
3864 /* Output code to add DELTA to the first argument, and then jump
3865 to FUNCTION. Used for C++ multiple inheritance. */
3866 static void
3867 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3868 HOST_WIDE_INT delta,
3869 HOST_WIDE_INT vcall_offset,
3870 tree function)
3872 /* The this pointer is always in x0. Note that this differs from
3873 Arm where the this pointer may be bumped to r1 if r0 is required
3874 to return a pointer to an aggregate. On AArch64 a result value
3875 pointer will be in x8. */
3876 int this_regno = R0_REGNUM;
3877 rtx this_rtx, temp0, temp1, addr, funexp;
3878 rtx_insn *insn;
3880 reload_completed = 1;
3881 emit_note (NOTE_INSN_PROLOGUE_END);
3883 if (vcall_offset == 0)
3884 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3885 else
3887 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3889 this_rtx = gen_rtx_REG (Pmode, this_regno);
3890 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3891 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3893 addr = this_rtx;
3894 if (delta != 0)
3896 if (delta >= -256 && delta < 256)
3897 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3898 plus_constant (Pmode, this_rtx, delta));
3899 else
3900 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3903 if (Pmode == ptr_mode)
3904 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3905 else
3906 aarch64_emit_move (temp0,
3907 gen_rtx_ZERO_EXTEND (Pmode,
3908 gen_rtx_MEM (ptr_mode, addr)));
3910 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3911 addr = plus_constant (Pmode, temp0, vcall_offset);
3912 else
3914 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3915 Pmode);
3916 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3919 if (Pmode == ptr_mode)
3920 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3921 else
3922 aarch64_emit_move (temp1,
3923 gen_rtx_SIGN_EXTEND (Pmode,
3924 gen_rtx_MEM (ptr_mode, addr)));
3926 emit_insn (gen_add2_insn (this_rtx, temp1));
3929 /* Generate a tail call to the target function. */
3930 if (!TREE_USED (function))
3932 assemble_external (function);
3933 TREE_USED (function) = 1;
3935 funexp = XEXP (DECL_RTL (function), 0);
3936 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3937 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3938 SIBLING_CALL_P (insn) = 1;
3940 insn = get_insns ();
3941 shorten_branches (insn);
3942 final_start_function (insn, file, 1);
3943 final (insn, file, 1);
3944 final_end_function ();
3946 /* Stop pretending to be a post-reload pass. */
3947 reload_completed = 0;
3950 static bool
3951 aarch64_tls_referenced_p (rtx x)
3953 if (!TARGET_HAVE_TLS)
3954 return false;
3955 subrtx_iterator::array_type array;
3956 FOR_EACH_SUBRTX (iter, array, x, ALL)
3958 const_rtx x = *iter;
3959 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3960 return true;
3961 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3962 TLS offsets, not real symbol references. */
3963 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3964 iter.skip_subrtxes ();
3966 return false;
3970 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3971 a left shift of 0 or 12 bits. */
3972 bool
3973 aarch64_uimm12_shift (HOST_WIDE_INT val)
3975 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3976 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
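/* For example, 0xfff and 0xabc000 (0xabc << 12) satisfy this test, while
   0x1001 does not because it has set bits in both halves. */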
3981 /* Return true if val is an immediate that can be loaded into a
3982 register by a MOVZ instruction. */
3983 static bool
3984 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3986 if (GET_MODE_SIZE (mode) > 4)
3988 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3989 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3990 return 1;
3992 else
3994 /* Ignore sign extension. */
3995 val &= (HOST_WIDE_INT) 0xffffffff;
3997 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3998 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
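/* For example, 0x12340000 (0x1234 << 16) can be loaded with a single MOVZ,
   whereas 0x120034 cannot because its set bits span two 16-bit chunks. */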
4001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4003 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4005 0x0000000100000001ull,
4006 0x0001000100010001ull,
4007 0x0101010101010101ull,
4008 0x1111111111111111ull,
4009 0x5555555555555555ull,
4013 /* Return true if val is a valid bitmask immediate. */
4015 bool
4016 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4018 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4019 int bits;
4021 /* Check for a single sequence of one bits and return quickly if so.
4022 The special cases of all ones and all zeroes return false. */
4023 val = (unsigned HOST_WIDE_INT) val_in;
4024 tmp = val + (val & -val);
4026 if (tmp == (tmp & -tmp))
4027 return (val + 1) > 1;
4029 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4030 if (mode == SImode)
4031 val = (val << 32) | (val & 0xffffffff);
4033 /* Invert if the immediate doesn't start with a zero bit - this means we
4034 only need to search for sequences of one bits. */
4035 if (val & 1)
4036 val = ~val;
4038 /* Find the first set bit and set tmp to val with the first sequence of one
4039 bits removed. Return success if there is a single sequence of ones. */
4040 first_one = val & -val;
4041 tmp = val & (val + first_one);
4043 if (tmp == 0)
4044 return true;
4046 /* Find the next set bit and compute the difference in bit position. */
4047 next_one = tmp & -tmp;
4048 bits = clz_hwi (first_one) - clz_hwi (next_one);
4049 mask = val ^ tmp;
4051 /* Check the bit position difference is a power of 2, and that the first
4052 sequence of one bits fits within 'bits' bits. */
4053 if ((mask >> bits) != 0 || bits != (bits & -bits))
4054 return false;
4056 /* Check the sequence of one bits is repeated 64/bits times. */
4057 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
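/* For example, 0x3ff8 (a single run of ones) and 0x0f0f0f0f0f0f0f0f (a
   4-bit run replicated every 8 bits) are valid bitmask immediates, while
   0xf1 is not, since its two runs cannot come from one replicated element. */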
4060 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4061 Assumed precondition: VAL_IN is not zero. */
4063 unsigned HOST_WIDE_INT
4064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4066 int lowest_bit_set = ctz_hwi (val_in);
4067 int highest_bit_set = floor_log2 (val_in);
4068 gcc_assert (val_in != 0);
4070 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4071 (HOST_WIDE_INT_1U << lowest_bit_set));
4074 /* Create a constant where all bits outside the range from the lowest set bit
4075 to the highest set bit of VAL_IN are set to 1. */
4077 unsigned HOST_WIDE_INT
4078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4080 return val_in | ~aarch64_and_split_imm1 (val_in);
4083 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4085 bool
4086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4088 if (aarch64_bitmask_imm (val_in, mode))
4089 return false;
4091 if (aarch64_move_imm (val_in, mode))
4092 return false;
4094 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4096 return aarch64_bitmask_imm (imm2, mode);
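/* Illustrative example, assuming DImode: for VAL_IN == 0x00f000f0, imm1 is
   0x00fffff0 and imm2 is 0xfffffffffff000ff, both bitmask immediates, and
   (x & imm1) & imm2 == x & 0x00f000f0, so the AND can be split into two
   AND-immediate instructions. */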
4099 /* Return true if val is an immediate that can be loaded into a
4100 register in a single instruction. */
4101 bool
4102 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4104 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4105 return 1;
4106 return aarch64_bitmask_imm (val, mode);
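/* A single-instruction load is a MOVZ when VAL itself is a shifted 16-bit
   chunk, a MOVN when ~VAL is, or a MOV (the ORR-with-zero-register alias)
   when VAL is a bitmask immediate. */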
4109 static bool
4110 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4112 rtx base, offset;
4114 if (GET_CODE (x) == HIGH)
4115 return true;
4117 split_const (x, &base, &offset);
4118 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4120 if (aarch64_classify_symbol (base, offset)
4121 != SYMBOL_FORCE_TO_MEM)
4122 return true;
4123 else
4124 /* Avoid generating a 64-bit relocation in ILP32; leave
4125 to aarch64_expand_mov_immediate to handle it properly. */
4126 return mode != ptr_mode;
4129 return aarch64_tls_referenced_p (x);
4132 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4133 The expansion for a table switch is quite expensive due to the number
4134 of instructions, the table lookup and the hard-to-predict indirect jump.
4135 When optimizing for speed and -O3 is enabled, use the per-core tuning if
4136 set, otherwise use tables for > 16 cases as a tradeoff between size and
4137 performance. When optimizing for size, use the default setting. */
4139 static unsigned int
4140 aarch64_case_values_threshold (void)
4142 /* Use the specified limit for the number of cases before using jump
4143 tables at higher optimization levels. */
4144 if (optimize > 2
4145 && selected_cpu->tune->max_case_values != 0)
4146 return selected_cpu->tune->max_case_values;
4147 else
4148 return optimize_size ? default_case_values_threshold () : 17;
4151 /* Return true if register REGNO is a valid index register.
4152 STRICT_P is true if REG_OK_STRICT is in effect. */
4154 bool
4155 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4157 if (!HARD_REGISTER_NUM_P (regno))
4159 if (!strict_p)
4160 return true;
4162 if (!reg_renumber)
4163 return false;
4165 regno = reg_renumber[regno];
4167 return GP_REGNUM_P (regno);
4170 /* Return true if register REGNO is a valid base register for mode MODE.
4171 STRICT_P is true if REG_OK_STRICT is in effect. */
4173 bool
4174 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4176 if (!HARD_REGISTER_NUM_P (regno))
4178 if (!strict_p)
4179 return true;
4181 if (!reg_renumber)
4182 return false;
4184 regno = reg_renumber[regno];
4187 /* The fake registers will be eliminated to either the stack or
4188 hard frame pointer, both of which are usually valid base registers.
4189 Reload deals with the cases where the eliminated form isn't valid. */
4190 return (GP_REGNUM_P (regno)
4191 || regno == SP_REGNUM
4192 || regno == FRAME_POINTER_REGNUM
4193 || regno == ARG_POINTER_REGNUM);
4196 /* Return true if X is a valid base register for mode MODE.
4197 STRICT_P is true if REG_OK_STRICT is in effect. */
4199 static bool
4200 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4202 if (!strict_p && GET_CODE (x) == SUBREG)
4203 x = SUBREG_REG (x);
4205 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4208 /* Return true if address offset is a valid index. If it is, fill in INFO
4209 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4211 static bool
4212 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4213 machine_mode mode, bool strict_p)
4215 enum aarch64_address_type type;
4216 rtx index;
4217 int shift;
4219 /* (reg:P) */
4220 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4221 && GET_MODE (x) == Pmode)
4223 type = ADDRESS_REG_REG;
4224 index = x;
4225 shift = 0;
4227 /* (sign_extend:DI (reg:SI)) */
4228 else if ((GET_CODE (x) == SIGN_EXTEND
4229 || GET_CODE (x) == ZERO_EXTEND)
4230 && GET_MODE (x) == DImode
4231 && GET_MODE (XEXP (x, 0)) == SImode)
4233 type = (GET_CODE (x) == SIGN_EXTEND)
4234 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4235 index = XEXP (x, 0);
4236 shift = 0;
4238 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4239 else if (GET_CODE (x) == MULT
4240 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4241 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4242 && GET_MODE (XEXP (x, 0)) == DImode
4243 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4244 && CONST_INT_P (XEXP (x, 1)))
4246 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4247 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4248 index = XEXP (XEXP (x, 0), 0);
4249 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4251 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4252 else if (GET_CODE (x) == ASHIFT
4253 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4254 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4255 && GET_MODE (XEXP (x, 0)) == DImode
4256 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4257 && CONST_INT_P (XEXP (x, 1)))
4259 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4260 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4261 index = XEXP (XEXP (x, 0), 0);
4262 shift = INTVAL (XEXP (x, 1));
4264 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4265 else if ((GET_CODE (x) == SIGN_EXTRACT
4266 || GET_CODE (x) == ZERO_EXTRACT)
4267 && GET_MODE (x) == DImode
4268 && GET_CODE (XEXP (x, 0)) == MULT
4269 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4270 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4272 type = (GET_CODE (x) == SIGN_EXTRACT)
4273 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4274 index = XEXP (XEXP (x, 0), 0);
4275 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4276 if (INTVAL (XEXP (x, 1)) != 32 + shift
4277 || INTVAL (XEXP (x, 2)) != 0)
4278 shift = -1;
4280 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4281 (const_int 0xffffffff<<shift)) */
4282 else if (GET_CODE (x) == AND
4283 && GET_MODE (x) == DImode
4284 && GET_CODE (XEXP (x, 0)) == MULT
4285 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4286 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4287 && CONST_INT_P (XEXP (x, 1)))
4289 type = ADDRESS_REG_UXTW;
4290 index = XEXP (XEXP (x, 0), 0);
4291 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4292 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4293 shift = -1;
4295 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4296 else if ((GET_CODE (x) == SIGN_EXTRACT
4297 || GET_CODE (x) == ZERO_EXTRACT)
4298 && GET_MODE (x) == DImode
4299 && GET_CODE (XEXP (x, 0)) == ASHIFT
4300 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4301 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4303 type = (GET_CODE (x) == SIGN_EXTRACT)
4304 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4305 index = XEXP (XEXP (x, 0), 0);
4306 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4307 if (INTVAL (XEXP (x, 1)) != 32 + shift
4308 || INTVAL (XEXP (x, 2)) != 0)
4309 shift = -1;
4311 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4312 (const_int 0xffffffff<<shift)) */
4313 else if (GET_CODE (x) == AND
4314 && GET_MODE (x) == DImode
4315 && GET_CODE (XEXP (x, 0)) == ASHIFT
4316 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4317 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4318 && CONST_INT_P (XEXP (x, 1)))
4320 type = ADDRESS_REG_UXTW;
4321 index = XEXP (XEXP (x, 0), 0);
4322 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4323 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4324 shift = -1;
4326 /* (mult:P (reg:P) (const_int scale)) */
4327 else if (GET_CODE (x) == MULT
4328 && GET_MODE (x) == Pmode
4329 && GET_MODE (XEXP (x, 0)) == Pmode
4330 && CONST_INT_P (XEXP (x, 1)))
4332 type = ADDRESS_REG_REG;
4333 index = XEXP (x, 0);
4334 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4336 /* (ashift:P (reg:P) (const_int shift)) */
4337 else if (GET_CODE (x) == ASHIFT
4338 && GET_MODE (x) == Pmode
4339 && GET_MODE (XEXP (x, 0)) == Pmode
4340 && CONST_INT_P (XEXP (x, 1)))
4342 type = ADDRESS_REG_REG;
4343 index = XEXP (x, 0);
4344 shift = INTVAL (XEXP (x, 1));
4346 else
4347 return false;
4349 if (GET_CODE (index) == SUBREG)
4350 index = SUBREG_REG (index);
4352 if ((shift == 0 ||
4353 (shift > 0 && shift <= 3
4354 && (1 << shift) == GET_MODE_SIZE (mode)))
4355 && REG_P (index)
4356 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4358 info->type = type;
4359 info->offset = index;
4360 info->shift = shift;
4361 return true;
4364 return false;
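/* For example, (mult:DI (sign_extend:DI (reg:SI n)) (const_int 8)) is
   classified as ADDRESS_REG_SXTW with shift 3 and is only accepted for
   8-byte accesses, matching the [base, wN, sxtw #3] addressing form. */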
4367 /* Return true if MODE is one of the modes for which we
4368 support LDP/STP operations. */
4370 static bool
4371 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4373 return mode == SImode || mode == DImode
4374 || mode == SFmode || mode == DFmode
4375 || (aarch64_vector_mode_supported_p (mode)
4376 && GET_MODE_SIZE (mode) == 8);
4379 /* Return true if REGNO is a virtual pointer register, or an eliminable
4380 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4381 include stack_pointer or hard_frame_pointer. */
4382 static bool
4383 virt_or_elim_regno_p (unsigned regno)
4385 return ((regno >= FIRST_VIRTUAL_REGISTER
4386 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4387 || regno == FRAME_POINTER_REGNUM
4388 || regno == ARG_POINTER_REGNUM);
4391 /* Return true if X is a valid address for machine mode MODE. If it is,
4392 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4393 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4395 static bool
4396 aarch64_classify_address (struct aarch64_address_info *info,
4397 rtx x, machine_mode mode,
4398 RTX_CODE outer_code, bool strict_p)
4400 enum rtx_code code = GET_CODE (x);
4401 rtx op0, op1;
4403 /* On BE, we use load/store pair for all large int mode load/stores.
4404 TI/TFmode may also use a load/store pair. */
4405 bool load_store_pair_p = (outer_code == PARALLEL
4406 || mode == TImode
4407 || mode == TFmode
4408 || (BYTES_BIG_ENDIAN
4409 && aarch64_vect_struct_mode_p (mode)));
4411 bool allow_reg_index_p =
4412 !load_store_pair_p
4413 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4414 && !aarch64_vect_struct_mode_p (mode);
4416 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4417 REG addressing. */
4418 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4419 && (code != POST_INC && code != REG))
4420 return false;
4422 switch (code)
4424 case REG:
4425 case SUBREG:
4426 info->type = ADDRESS_REG_IMM;
4427 info->base = x;
4428 info->offset = const0_rtx;
4429 return aarch64_base_register_rtx_p (x, strict_p);
4431 case PLUS:
4432 op0 = XEXP (x, 0);
4433 op1 = XEXP (x, 1);
4435 if (! strict_p
4436 && REG_P (op0)
4437 && virt_or_elim_regno_p (REGNO (op0))
4438 && CONST_INT_P (op1))
4440 info->type = ADDRESS_REG_IMM;
4441 info->base = op0;
4442 info->offset = op1;
4444 return true;
4447 if (GET_MODE_SIZE (mode) != 0
4448 && CONST_INT_P (op1)
4449 && aarch64_base_register_rtx_p (op0, strict_p))
4451 HOST_WIDE_INT offset = INTVAL (op1);
4453 info->type = ADDRESS_REG_IMM;
4454 info->base = op0;
4455 info->offset = op1;
4457 /* TImode and TFmode values are allowed in both pairs of X
4458 registers and individual Q registers. The available
4459 address modes are:
4460 X,X: 7-bit signed scaled offset
4461 Q: 9-bit signed offset
4462 We conservatively require an offset representable in either mode.
4463 When performing the check for pairs of X registers i.e. LDP/STP
4464 pass down DImode since that is the natural size of the LDP/STP
4465 instruction memory accesses. */
4466 if (mode == TImode || mode == TFmode)
4467 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4468 && (offset_9bit_signed_unscaled_p (mode, offset)
4469 || offset_12bit_unsigned_scaled_p (mode, offset)));
4471 /* A 7-bit offset check because OImode will emit an ldp/stp
4472 instruction (only big endian will get here).
4473 For ldp/stp instructions, the offset is scaled for the size of a
4474 single element of the pair. */
4475 if (mode == OImode)
4476 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4478 /* Three 9/12-bit offset checks because CImode will emit three
4479 ldr/str instructions (only big endian will get here). */
4480 if (mode == CImode)
4481 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4482 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4483 || offset_12bit_unsigned_scaled_p (V16QImode,
4484 offset + 32)));
4486 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4487 instructions (only big endian will get here). */
4488 if (mode == XImode)
4489 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4490 && aarch64_offset_7bit_signed_scaled_p (TImode,
4491 offset + 32));
4493 if (load_store_pair_p)
4494 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4495 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4496 else
4497 return (offset_9bit_signed_unscaled_p (mode, offset)
4498 || offset_12bit_unsigned_scaled_p (mode, offset));
4501 if (allow_reg_index_p)
4503 /* Look for base + (scaled/extended) index register. */
4504 if (aarch64_base_register_rtx_p (op0, strict_p)
4505 && aarch64_classify_index (info, op1, mode, strict_p))
4507 info->base = op0;
4508 return true;
4510 if (aarch64_base_register_rtx_p (op1, strict_p)
4511 && aarch64_classify_index (info, op0, mode, strict_p))
4513 info->base = op1;
4514 return true;
4518 return false;
4520 case POST_INC:
4521 case POST_DEC:
4522 case PRE_INC:
4523 case PRE_DEC:
4524 info->type = ADDRESS_REG_WB;
4525 info->base = XEXP (x, 0);
4526 info->offset = NULL_RTX;
4527 return aarch64_base_register_rtx_p (info->base, strict_p);
4529 case POST_MODIFY:
4530 case PRE_MODIFY:
4531 info->type = ADDRESS_REG_WB;
4532 info->base = XEXP (x, 0);
4533 if (GET_CODE (XEXP (x, 1)) == PLUS
4534 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4535 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4536 && aarch64_base_register_rtx_p (info->base, strict_p))
4538 HOST_WIDE_INT offset;
4539 info->offset = XEXP (XEXP (x, 1), 1);
4540 offset = INTVAL (info->offset);
4542 /* TImode and TFmode values are allowed in both pairs of X
4543 registers and individual Q registers. The available
4544 address modes are:
4545 X,X: 7-bit signed scaled offset
4546 Q: 9-bit signed offset
4547 We conservatively require an offset representable in either mode.
4549 if (mode == TImode || mode == TFmode)
4550 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4551 && offset_9bit_signed_unscaled_p (mode, offset));
4553 if (load_store_pair_p)
4554 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4555 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4556 else
4557 return offset_9bit_signed_unscaled_p (mode, offset);
4559 return false;
4561 case CONST:
4562 case SYMBOL_REF:
4563 case LABEL_REF:
4564 /* load literal: pc-relative constant pool entry. Only supported
4565 for SI mode or larger. */
4566 info->type = ADDRESS_SYMBOLIC;
4568 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4570 rtx sym, addend;
4572 split_const (x, &sym, &addend);
4573 return ((GET_CODE (sym) == LABEL_REF
4574 || (GET_CODE (sym) == SYMBOL_REF
4575 && CONSTANT_POOL_ADDRESS_P (sym)
4576 && aarch64_pcrelative_literal_loads)));
4578 return false;
4580 case LO_SUM:
4581 info->type = ADDRESS_LO_SUM;
4582 info->base = XEXP (x, 0);
4583 info->offset = XEXP (x, 1);
4584 if (allow_reg_index_p
4585 && aarch64_base_register_rtx_p (info->base, strict_p))
4587 rtx sym, offs;
4588 split_const (info->offset, &sym, &offs);
4589 if (GET_CODE (sym) == SYMBOL_REF
4590 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4592 /* The symbol and offset must be aligned to the access size. */
4593 unsigned int align;
4594 unsigned int ref_size;
4596 if (CONSTANT_POOL_ADDRESS_P (sym))
4597 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4598 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4600 tree exp = SYMBOL_REF_DECL (sym);
4601 align = TYPE_ALIGN (TREE_TYPE (exp));
4602 align = CONSTANT_ALIGNMENT (exp, align);
4604 else if (SYMBOL_REF_DECL (sym))
4605 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4606 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4607 && SYMBOL_REF_BLOCK (sym) != NULL)
4608 align = SYMBOL_REF_BLOCK (sym)->alignment;
4609 else
4610 align = BITS_PER_UNIT;
4612 ref_size = GET_MODE_SIZE (mode);
4613 if (ref_size == 0)
4614 ref_size = GET_MODE_SIZE (DImode);
4616 return ((INTVAL (offs) & (ref_size - 1)) == 0
4617 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4620 return false;
4622 default:
4623 return false;
4627 /* Return true if the address X is valid for a PRFM instruction.
4628 STRICT_P is true if we should do strict checking with
4629 aarch64_classify_address. */
4631 bool
4632 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4634 struct aarch64_address_info addr;
4636 /* PRFM accepts the same addresses as DImode... */
4637 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4638 if (!res)
4639 return false;
4641 /* ... except writeback forms. */
4642 return addr.type != ADDRESS_REG_WB;
4645 bool
4646 aarch64_symbolic_address_p (rtx x)
4648 rtx offset;
4650 split_const (x, &x, &offset);
4651 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4654 /* Classify the base of symbolic expression X. */
4656 enum aarch64_symbol_type
4657 aarch64_classify_symbolic_expression (rtx x)
4659 rtx offset;
4661 split_const (x, &x, &offset);
4662 return aarch64_classify_symbol (x, offset);
4666 /* Return TRUE if X is a legitimate address for accessing memory in
4667 mode MODE. */
4668 static bool
4669 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4671 struct aarch64_address_info addr;
4673 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4676 /* Return TRUE if X is a legitimate address for accessing memory in
4677 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4678 pair operation. */
4679 bool
4680 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4681 RTX_CODE outer_code, bool strict_p)
4683 struct aarch64_address_info addr;
4685 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4688 /* Split an out-of-range address displacement into a base and offset.
4689 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4690 to increase opportunities for sharing the base address between accesses of different sizes.
4691 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4692 static bool
4693 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4695 HOST_WIDE_INT offset = INTVAL (*disp);
4696 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4698 if (mode == TImode || mode == TFmode
4699 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4700 base = (offset + 0x100) & ~0x1ff;
4702 *off = GEN_INT (base);
4703 *disp = GEN_INT (offset - base);
4704 return true;
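/* For example, a DImode access at offset 0x4008 is split into a base part
   of 0x4000 and a residual displacement of 8, so nearby accesses of
   different sizes can share the same anchor. */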
4707 /* Return the binary representation of floating point constant VALUE in INTVAL.
4708 If the value cannot be converted, return false without setting INTVAL.
4709 The conversion is done in the given MODE. */
4710 bool
4711 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4714 /* We make a general exception for 0. */
4715 if (aarch64_float_const_zero_rtx_p (value))
4717 *intval = 0;
4718 return true;
4721 machine_mode mode = GET_MODE (value);
4722 if (GET_CODE (value) != CONST_DOUBLE
4723 || !SCALAR_FLOAT_MODE_P (mode)
4724 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4725 /* Only support up to DF mode. */
4726 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4727 return false;
4729 unsigned HOST_WIDE_INT ival = 0;
4731 long res[2];
4732 real_to_target (res,
4733 CONST_DOUBLE_REAL_VALUE (value),
4734 REAL_MODE_FORMAT (mode));
4736 ival = zext_hwi (res[0], 32);
4737 if (GET_MODE_BITSIZE (mode) == GET_MODE_BITSIZE (DFmode))
4738 ival |= (zext_hwi (res[1], 32) << 32);
4740 *intval = ival;
4741 return true;
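/* For example, 1.0 is returned as 0x3f800000 in SFmode and as
   0x3ff0000000000000 in DFmode (the IEEE 754 bit patterns). */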
4744 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4745 single MOV(+MOVK) followed by an FMOV. */
4746 bool
4747 aarch64_float_const_rtx_p (rtx x)
4749 machine_mode mode = GET_MODE (x);
4750 if (mode == VOIDmode)
4751 return false;
4753 /* Determine whether it's cheaper to write float constants as
4754 mov/movk sequences rather than adrp/ldr literal loads. */
4755 unsigned HOST_WIDE_INT ival;
4757 if (GET_CODE (x) == CONST_DOUBLE
4758 && SCALAR_FLOAT_MODE_P (mode)
4759 && aarch64_reinterpret_float_as_int (x, &ival))
4761 machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
4762 int num_instr = aarch64_internal_mov_immediate
4763 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4764 return num_instr < 3;
4767 return false;
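/* For instance, 1.0 in DFmode has the bit pattern 0x3ff0000000000000,
   which a single MOVZ (0x3ff0 << 48) can materialize, so this function
   returns true and the constant avoids a literal-pool load. */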
4770 /* Return TRUE if rtx X is immediate constant 0.0 */
4771 bool
4772 aarch64_float_const_zero_rtx_p (rtx x)
4774 if (GET_MODE (x) == VOIDmode)
4775 return false;
4777 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4778 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4779 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4782 /* Return TRUE if rtx X is immediate constant that fits in a single
4783 MOVI immediate operation. */
4784 bool
4785 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4787 if (!TARGET_SIMD)
4788 return false;
4790 machine_mode vmode, imode;
4791 unsigned HOST_WIDE_INT ival;
4793 if (GET_CODE (x) == CONST_DOUBLE
4794 && SCALAR_FLOAT_MODE_P (mode))
4796 if (!aarch64_reinterpret_float_as_int (x, &ival))
4797 return false;
4799 /* We make a general exception for 0. */
4800 if (aarch64_float_const_zero_rtx_p (x))
4801 return true;
4803 imode = int_mode_for_mode (mode);
4805 else if (GET_CODE (x) == CONST_INT
4806 && SCALAR_INT_MODE_P (mode))
4808 imode = mode;
4809 ival = INTVAL (x);
4811 else
4812 return false;
4814 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
4815 a 128-bit vector mode. */
4816 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4818 vmode = aarch64_simd_container_mode (imode, width);
4819 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4821 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4825 /* Return the fixed registers used for condition codes. */
4827 static bool
4828 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4830 *p1 = CC_REGNUM;
4831 *p2 = INVALID_REGNUM;
4832 return true;
4835 /* This function is used by the call expanders of the machine description.
4836 RESULT is the register in which the result is returned. It's NULL for
4837 "call" and "sibcall".
4838 MEM is the location of the function call.
4839 SIBCALL indicates whether this function call is normal call or sibling call.
4840 It will generate different pattern accordingly. */
4842 void
4843 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4845 rtx call, callee, tmp;
4846 rtvec vec;
4847 machine_mode mode;
4849 gcc_assert (MEM_P (mem));
4850 callee = XEXP (mem, 0);
4851 mode = GET_MODE (callee);
4852 gcc_assert (mode == Pmode);
4854 /* Decide if we should generate indirect calls by loading the
4855 address of the callee into a register before performing
4856 the branch-and-link. */
4857 if (SYMBOL_REF_P (callee)
4858 ? (aarch64_is_long_call_p (callee)
4859 || aarch64_is_noplt_call_p (callee))
4860 : !REG_P (callee))
4861 XEXP (mem, 0) = force_reg (mode, callee);
4863 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4865 if (result != NULL_RTX)
4866 call = gen_rtx_SET (result, call);
4868 if (sibcall)
4869 tmp = ret_rtx;
4870 else
4871 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4873 vec = gen_rtvec (2, call, tmp);
4874 call = gen_rtx_PARALLEL (VOIDmode, vec);
4876 aarch64_emit_call_insn (call);
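/* The emitted pattern is a PARALLEL of the CALL (wrapped in a SET of
   RESULT when a value is returned) together with either (return) for a
   sibcall or a clobber of LR for a normal call. */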
4879 /* Emit call insn with PAT and do aarch64-specific handling. */
4881 void
4882 aarch64_emit_call_insn (rtx pat)
4884 rtx insn = emit_call_insn (pat);
4886 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4887 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4888 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4891 machine_mode
4892 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4894 /* All floating point compares return CCFP if it is an equality
4895 comparison, and CCFPE otherwise. */
4896 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4898 switch (code)
4900 case EQ:
4901 case NE:
4902 case UNORDERED:
4903 case ORDERED:
4904 case UNLT:
4905 case UNLE:
4906 case UNGT:
4907 case UNGE:
4908 case UNEQ:
4909 case LTGT:
4910 return CCFPmode;
4912 case LT:
4913 case LE:
4914 case GT:
4915 case GE:
4916 return CCFPEmode;
4918 default:
4919 gcc_unreachable ();
4923 /* Equality comparisons of short modes against zero can be performed
4924 using the TST instruction with the appropriate bitmask. */
4925 if (y == const0_rtx && REG_P (x)
4926 && (code == EQ || code == NE)
4927 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4928 return CC_NZmode;
4930 /* Similarly, comparisons of zero_extends from shorter modes can
4931 be performed using an ANDS with an immediate mask. */
4932 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4933 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4934 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4935 && (code == EQ || code == NE))
4936 return CC_NZmode;
4938 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4939 && y == const0_rtx
4940 && (code == EQ || code == NE || code == LT || code == GE)
4941 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4942 || GET_CODE (x) == NEG
4943 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4944 && CONST_INT_P (XEXP (x, 2)))))
4945 return CC_NZmode;
4947 /* A compare with a shifted operand. Because of canonicalization,
4948 the comparison will have to be swapped when we emit the assembly
4949 code. */
4950 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4951 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4952 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4953 || GET_CODE (x) == LSHIFTRT
4954 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4955 return CC_SWPmode;
4957 /* Similarly for a negated operand, but we can only do this for
4958 equalities. */
4959 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4960 && (REG_P (y) || GET_CODE (y) == SUBREG)
4961 && (code == EQ || code == NE)
4962 && GET_CODE (x) == NEG)
4963 return CC_Zmode;
4965 /* A test for unsigned overflow. */
4966 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4967 && code == NE
4968 && GET_CODE (x) == PLUS
4969 && GET_CODE (y) == ZERO_EXTEND)
4970 return CC_Cmode;
4972 /* For everything else, return CCmode. */
4973 return CCmode;
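/* For example, (compare (ashift:DI x 2) y) selects CC_SWPmode: the
   comparison is emitted with its operands swapped (e.g. cmp y, x, lsl 2)
   and the CC_SWP condition mapping below undoes the swap. */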
4976 static int
4977 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4980 aarch64_get_condition_code (rtx x)
4982 machine_mode mode = GET_MODE (XEXP (x, 0));
4983 enum rtx_code comp_code = GET_CODE (x);
4985 if (GET_MODE_CLASS (mode) != MODE_CC)
4986 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4987 return aarch64_get_condition_code_1 (mode, comp_code);
4990 static int
4991 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
4993 switch (mode)
4995 case CCFPmode:
4996 case CCFPEmode:
4997 switch (comp_code)
4999 case GE: return AARCH64_GE;
5000 case GT: return AARCH64_GT;
5001 case LE: return AARCH64_LS;
5002 case LT: return AARCH64_MI;
5003 case NE: return AARCH64_NE;
5004 case EQ: return AARCH64_EQ;
5005 case ORDERED: return AARCH64_VC;
5006 case UNORDERED: return AARCH64_VS;
5007 case UNLT: return AARCH64_LT;
5008 case UNLE: return AARCH64_LE;
5009 case UNGT: return AARCH64_HI;
5010 case UNGE: return AARCH64_PL;
5011 default: return -1;
5013 break;
5015 case CCmode:
5016 switch (comp_code)
5018 case NE: return AARCH64_NE;
5019 case EQ: return AARCH64_EQ;
5020 case GE: return AARCH64_GE;
5021 case GT: return AARCH64_GT;
5022 case LE: return AARCH64_LE;
5023 case LT: return AARCH64_LT;
5024 case GEU: return AARCH64_CS;
5025 case GTU: return AARCH64_HI;
5026 case LEU: return AARCH64_LS;
5027 case LTU: return AARCH64_CC;
5028 default: return -1;
5030 break;
5032 case CC_SWPmode:
5033 switch (comp_code)
5035 case NE: return AARCH64_NE;
5036 case EQ: return AARCH64_EQ;
5037 case GE: return AARCH64_LE;
5038 case GT: return AARCH64_LT;
5039 case LE: return AARCH64_GE;
5040 case LT: return AARCH64_GT;
5041 case GEU: return AARCH64_LS;
5042 case GTU: return AARCH64_CC;
5043 case LEU: return AARCH64_CS;
5044 case LTU: return AARCH64_HI;
5045 default: return -1;
5047 break;
5049 case CC_NZmode:
5050 switch (comp_code)
5052 case NE: return AARCH64_NE;
5053 case EQ: return AARCH64_EQ;
5054 case GE: return AARCH64_PL;
5055 case LT: return AARCH64_MI;
5056 default: return -1;
5058 break;
5060 case CC_Zmode:
5061 switch (comp_code)
5063 case NE: return AARCH64_NE;
5064 case EQ: return AARCH64_EQ;
5065 default: return -1;
5067 break;
5069 case CC_Cmode:
5070 switch (comp_code)
5072 case NE: return AARCH64_CS;
5073 case EQ: return AARCH64_CC;
5074 default: return -1;
5076 break;
5078 default:
5079 return -1;
5082 return -1;
5085 bool
5086 aarch64_const_vec_all_same_in_range_p (rtx x,
5087 HOST_WIDE_INT minval,
5088 HOST_WIDE_INT maxval)
5090 HOST_WIDE_INT firstval;
5091 int count, i;
5093 if (GET_CODE (x) != CONST_VECTOR
5094 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5095 return false;
5097 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5098 if (firstval < minval || firstval > maxval)
5099 return false;
5101 count = CONST_VECTOR_NUNITS (x);
5102 for (i = 1; i < count; i++)
5103 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5104 return false;
5106 return true;
5109 bool
5110 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5112 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5116 /* N Z C V. */
5117 #define AARCH64_CC_V 1
5118 #define AARCH64_CC_C (1 << 1)
5119 #define AARCH64_CC_Z (1 << 2)
5120 #define AARCH64_CC_N (1 << 3)
5122 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5123 static const int aarch64_nzcv_codes[] =
5125 0, /* EQ, Z == 1. */
5126 AARCH64_CC_Z, /* NE, Z == 0. */
5127 0, /* CS, C == 1. */
5128 AARCH64_CC_C, /* CC, C == 0. */
5129 0, /* MI, N == 1. */
5130 AARCH64_CC_N, /* PL, N == 0. */
5131 0, /* VS, V == 1. */
5132 AARCH64_CC_V, /* VC, V == 0. */
5133 0, /* HI, C == 1 && Z == 0. */
5134 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5135 AARCH64_CC_V, /* GE, N == V. */
5136 0, /* LT, N != V. */
5137 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5138 0, /* LE, !(Z == 0 && N == V). */
5139 0, /* AL, Any. */
5140 0 /* NV, Any. */
5143 /* Print operand X to file F in a target specific manner according to CODE.
5144 The acceptable formatting commands given by CODE are:
5145 'c': An integer or symbol address without a preceding #
5146 sign.
5147 'e': Print the sign/zero-extend size as a character 8->b,
5148 16->h, 32->w.
5149 'p': Prints N such that 2^N == X (X must be power of 2 and
5150 const int).
5151 'P': Print the number of non-zero bits in X (a const_int).
5152 'H': Print the higher numbered register of a pair (TImode)
5153 of regs.
5154 'm': Print a condition (eq, ne, etc).
5155 'M': Same as 'm', but invert condition.
5156 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5157 'S/T/U/V': Print a FP/SIMD register name for a register list.
5158 The register printed is the FP/SIMD register name
5159 of X + 0/1/2/3 for S/T/U/V.
5160 'R': Print a scalar FP/SIMD register name + 1.
5161 'X': Print bottom 16 bits of integer constant in hex.
5162 'w/x': Print a general register name or the zero register
5163 (32-bit or 64-bit).
5164 '0': Print a normal operand; if it's a general register,
5165 then we assume DImode.
5166 'k': Print NZCV for conditional compare instructions.
5167 'A': Output address constant representing the first
5168 argument of X, specifying a relocation offset
5169 if appropriate.
5170 'L': Output constant address specified by X
5171 with a relocation offset if appropriate.
5172 'G': Prints address of X, specifying a PC relative
5173 relocation mode if appropriate. */
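/* For example, for a general register operand, %x0 and %w0 print its x-
   and w-register names, const0_rtx prints as xzr/wzr, and the stack
   pointer prints as sp/wsp. */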
5175 static void
5176 aarch64_print_operand (FILE *f, rtx x, int code)
5178 switch (code)
5180 case 'c':
5181 switch (GET_CODE (x))
5183 case CONST_INT:
5184 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5185 break;
5187 case SYMBOL_REF:
5188 output_addr_const (f, x);
5189 break;
5191 case CONST:
5192 if (GET_CODE (XEXP (x, 0)) == PLUS
5193 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5195 output_addr_const (f, x);
5196 break;
5198 /* Fall through. */
5200 default:
5201 output_operand_lossage ("Unsupported operand for code '%c'", code);
5203 break;
5205 case 'e':
5207 int n;
5209 if (!CONST_INT_P (x)
5210 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5212 output_operand_lossage ("invalid operand for '%%%c'", code);
5213 return;
5216 switch (n)
5218 case 3:
5219 fputc ('b', f);
5220 break;
5221 case 4:
5222 fputc ('h', f);
5223 break;
5224 case 5:
5225 fputc ('w', f);
5226 break;
5227 default:
5228 output_operand_lossage ("invalid operand for '%%%c'", code);
5229 return;
5232 break;
5234 case 'p':
5236 int n;
5238 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5240 output_operand_lossage ("invalid operand for '%%%c'", code);
5241 return;
5244 asm_fprintf (f, "%d", n);
5246 break;
5248 case 'P':
5249 if (!CONST_INT_P (x))
5251 output_operand_lossage ("invalid operand for '%%%c'", code);
5252 return;
5255 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5256 break;
5258 case 'H':
5259 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5261 output_operand_lossage ("invalid operand for '%%%c'", code);
5262 return;
5265 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5266 break;
5268 case 'M':
5269 case 'm':
5271 int cond_code;
5272 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5273 if (x == const_true_rtx)
5275 if (code == 'M')
5276 fputs ("nv", f);
5277 return;
5280 if (!COMPARISON_P (x))
5282 output_operand_lossage ("invalid operand for '%%%c'", code);
5283 return;
5286 cond_code = aarch64_get_condition_code (x);
5287 gcc_assert (cond_code >= 0);
5288 if (code == 'M')
5289 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5290 fputs (aarch64_condition_codes[cond_code], f);
5292 break;
5294 case 'b':
5295 case 'h':
5296 case 's':
5297 case 'd':
5298 case 'q':
5299 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5301 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5302 return;
5304 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5305 break;
5307 case 'S':
5308 case 'T':
5309 case 'U':
5310 case 'V':
5311 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5313 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5314 return;
5316 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5317 break;
5319 case 'R':
5320 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5322 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5323 return;
5325 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5326 break;
5328 case 'X':
5329 if (!CONST_INT_P (x))
5331 output_operand_lossage ("invalid operand for '%%%c'", code);
5332 return;
5334 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5335 break;
5337 case 'w':
5338 case 'x':
5339 if (x == const0_rtx
5340 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5342 asm_fprintf (f, "%czr", code);
5343 break;
5346 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5348 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5349 break;
5352 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5354 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5355 break;
5358 /* Fall through */
5360 case 0:
5361 if (x == NULL)
5363 output_operand_lossage ("missing operand");
5364 return;
5367 switch (GET_CODE (x))
5369 case REG:
5370 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5371 break;
5373 case MEM:
5374 output_address (GET_MODE (x), XEXP (x, 0));
5375 /* Check all memory references are Pmode - even with ILP32. */
5376 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5377 break;
5379 case CONST:
5380 case LABEL_REF:
5381 case SYMBOL_REF:
5382 output_addr_const (asm_out_file, x);
5383 break;
5385 case CONST_INT:
5386 asm_fprintf (f, "%wd", INTVAL (x));
5387 break;
5389 case CONST_VECTOR:
5390 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5392 gcc_assert (
5393 aarch64_const_vec_all_same_in_range_p (x,
5394 HOST_WIDE_INT_MIN,
5395 HOST_WIDE_INT_MAX));
5396 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5398 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5400 fputc ('0', f);
5402 else
5403 gcc_unreachable ();
5404 break;
5406 case CONST_DOUBLE:
5407 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5408 be getting CONST_DOUBLEs holding integers. */
5409 gcc_assert (GET_MODE (x) != VOIDmode);
5410 if (aarch64_float_const_zero_rtx_p (x))
5412 fputc ('0', f);
5413 break;
5415 else if (aarch64_float_const_representable_p (x))
5417 #define buf_size 20
5418 char float_buf[buf_size] = {'\0'};
5419 real_to_decimal_for_mode (float_buf,
5420 CONST_DOUBLE_REAL_VALUE (x),
5421 buf_size, buf_size,
5422 1, GET_MODE (x));
5423 asm_fprintf (asm_out_file, "%s", float_buf);
5424 break;
5425 #undef buf_size
5427 output_operand_lossage ("invalid constant");
5428 return;
5429 default:
5430 output_operand_lossage ("invalid operand");
5431 return;
5433 break;
5435 case 'A':
5436 if (GET_CODE (x) == HIGH)
5437 x = XEXP (x, 0);
5439 switch (aarch64_classify_symbolic_expression (x))
5441 case SYMBOL_SMALL_GOT_4G:
5442 asm_fprintf (asm_out_file, ":got:");
5443 break;
5445 case SYMBOL_SMALL_TLSGD:
5446 asm_fprintf (asm_out_file, ":tlsgd:");
5447 break;
5449 case SYMBOL_SMALL_TLSDESC:
5450 asm_fprintf (asm_out_file, ":tlsdesc:");
5451 break;
5453 case SYMBOL_SMALL_TLSIE:
5454 asm_fprintf (asm_out_file, ":gottprel:");
5455 break;
5457 case SYMBOL_TLSLE24:
5458 asm_fprintf (asm_out_file, ":tprel:");
5459 break;
5461 case SYMBOL_TINY_GOT:
5462 gcc_unreachable ();
5463 break;
5465 default:
5466 break;
5468 output_addr_const (asm_out_file, x);
5469 break;
5471 case 'L':
5472 switch (aarch64_classify_symbolic_expression (x))
5474 case SYMBOL_SMALL_GOT_4G:
5475 asm_fprintf (asm_out_file, ":lo12:");
5476 break;
5478 case SYMBOL_SMALL_TLSGD:
5479 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5480 break;
5482 case SYMBOL_SMALL_TLSDESC:
5483 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5484 break;
5486 case SYMBOL_SMALL_TLSIE:
5487 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5488 break;
5490 case SYMBOL_TLSLE12:
5491 asm_fprintf (asm_out_file, ":tprel_lo12:");
5492 break;
5494 case SYMBOL_TLSLE24:
5495 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5496 break;
5498 case SYMBOL_TINY_GOT:
5499 asm_fprintf (asm_out_file, ":got:");
5500 break;
5502 case SYMBOL_TINY_TLSIE:
5503 asm_fprintf (asm_out_file, ":gottprel:");
5504 break;
5506 default:
5507 break;
5509 output_addr_const (asm_out_file, x);
5510 break;
5512 case 'G':
5513 switch (aarch64_classify_symbolic_expression (x))
5515 case SYMBOL_TLSLE24:
5516 asm_fprintf (asm_out_file, ":tprel_hi12:");
5517 break;
5518 default:
5519 break;
5521 output_addr_const (asm_out_file, x);
5522 break;
5524 case 'k':
5526 HOST_WIDE_INT cond_code;
5528 if (!CONST_INT_P (x))
5530 output_operand_lossage ("invalid operand for '%%%c'", code);
5531 return;
5534 cond_code = INTVAL (x);
5535 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5536 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5538 break;
5540 default:
5541 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5542 return;
5546 static void
5547 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5549 struct aarch64_address_info addr;
5551 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5552 switch (addr.type)
5554 case ADDRESS_REG_IMM:
5555 if (addr.offset == const0_rtx)
5556 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5557 else
5558 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5559 INTVAL (addr.offset));
5560 return;
5562 case ADDRESS_REG_REG:
5563 if (addr.shift == 0)
5564 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5565 reg_names [REGNO (addr.offset)]);
5566 else
5567 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5568 reg_names [REGNO (addr.offset)], addr.shift);
5569 return;
5571 case ADDRESS_REG_UXTW:
5572 if (addr.shift == 0)
5573 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5574 REGNO (addr.offset) - R0_REGNUM);
5575 else
5576 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5577 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5578 return;
5580 case ADDRESS_REG_SXTW:
5581 if (addr.shift == 0)
5582 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5583 REGNO (addr.offset) - R0_REGNUM);
5584 else
5585 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5586 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5587 return;
5589 case ADDRESS_REG_WB:
5590 switch (GET_CODE (x))
5592 case PRE_INC:
5593 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5594 GET_MODE_SIZE (mode));
5595 return;
5596 case POST_INC:
5597 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5598 GET_MODE_SIZE (mode));
5599 return;
5600 case PRE_DEC:
5601 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5602 GET_MODE_SIZE (mode));
5603 return;
5604 case POST_DEC:
5605 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5606 GET_MODE_SIZE (mode));
5607 return;
5608 case PRE_MODIFY:
5609 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5610 INTVAL (addr.offset));
5611 return;
5612 case POST_MODIFY:
5613 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5614 INTVAL (addr.offset));
5615 return;
5616 default:
5617 break;
5619 break;
5621 case ADDRESS_LO_SUM:
5622 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5623 output_addr_const (f, addr.offset);
5624 asm_fprintf (f, "]");
5625 return;
5627 case ADDRESS_SYMBOLIC:
5628 break;
5631 output_addr_const (f, x);
5634 bool
5635 aarch64_label_mentioned_p (rtx x)
5637 const char *fmt;
5638 int i;
5640 if (GET_CODE (x) == LABEL_REF)
5641 return true;
5643 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5644 referencing instruction, but they are constant offsets, not
5645 symbols. */
5646 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5647 return false;
5649 fmt = GET_RTX_FORMAT (GET_CODE (x));
5650 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5652 if (fmt[i] == 'E')
5654 int j;
5656 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5657 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5658 return 1;
5660 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5661 return 1;
5664 return 0;
5667 /* Implement REGNO_REG_CLASS. */
5669 enum reg_class
5670 aarch64_regno_regclass (unsigned regno)
5672 if (GP_REGNUM_P (regno))
5673 return GENERAL_REGS;
5675 if (regno == SP_REGNUM)
5676 return STACK_REG;
5678 if (regno == FRAME_POINTER_REGNUM
5679 || regno == ARG_POINTER_REGNUM)
5680 return POINTER_REGS;
5682 if (FP_REGNUM_P (regno))
5683 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5685 return NO_REGS;
5688 static rtx
5689 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5691 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5692 where mask is selected by alignment and size of the offset.
5693 We try to pick as large a range for the offset as possible to
5694 maximize the chance of a CSE. However, for aligned addresses
5695 we limit the range to 4k so that structures with different sized
5696 elements are likely to use the same base. We need to be careful
5697 not to split a CONST for some forms of address expression, otherwise
5698 it will generate sub-optimal code. */
5700 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5702 rtx base = XEXP (x, 0);
5703 rtx offset_rtx = XEXP (x, 1);
5704 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5706 if (GET_CODE (base) == PLUS)
5708 rtx op0 = XEXP (base, 0);
5709 rtx op1 = XEXP (base, 1);
5711 /* Force any scaling into a temp for CSE. */
5712 op0 = force_reg (Pmode, op0);
5713 op1 = force_reg (Pmode, op1);
5715 /* Let the pointer register be in op0. */
5716 if (REG_POINTER (op1))
5717 std::swap (op0, op1);
5719 /* If the pointer is virtual or frame related, then we know that
5720 virtual register instantiation or register elimination is going
5721 to apply a second constant. We want the two constants folded
5722 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5723 if (virt_or_elim_regno_p (REGNO (op0)))
5725 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5726 NULL_RTX, true, OPTAB_DIRECT);
5727 return gen_rtx_PLUS (Pmode, base, op1);
5730 /* Otherwise, in order to encourage CSE (and thence loop strength
5731 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5732 base = expand_binop (Pmode, add_optab, op0, op1,
5733 NULL_RTX, true, OPTAB_DIRECT);
5734 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5737 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5738 HOST_WIDE_INT base_offset;
5739 if (GET_MODE_SIZE (mode) > 16)
5740 base_offset = (offset + 0x400) & ~0x7f0;
5741 /* For offsets that aren't a multiple of the access size, the limit is
5742 -256...255. */
5743 else if (offset & (GET_MODE_SIZE (mode) - 1))
5745 base_offset = (offset + 0x100) & ~0x1ff;
5747 /* BLKmode typically uses LDP of X-registers. */
5748 if (mode == BLKmode)
5749 base_offset = (offset + 512) & ~0x3ff;
5751 /* Small negative offsets are supported. */
5752 else if (IN_RANGE (offset, -256, 0))
5753 base_offset = 0;
5754 else if (mode == TImode || mode == TFmode)
5755 base_offset = (offset + 0x100) & ~0x1ff;
5756 /* Use a 12-bit offset scaled by the access size. */
5757 else
5758 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5760 if (base_offset != 0)
5762 base = plus_constant (Pmode, base, base_offset);
5763 base = force_operand (base, NULL_RTX);
5764 return plus_constant (Pmode, base, offset - base_offset);
5768 return x;
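/* For example, an SImode access at X + 0x4010 is rewritten as
   (X + 0x4000) + 0x10: the 0x4000 anchor can be CSEd across neighbouring
   accesses while 0x10 fits the scaled 12-bit immediate form. */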
5771 /* Return the reload icode required for a constant pool in mode. */
5772 static enum insn_code
5773 aarch64_constant_pool_reload_icode (machine_mode mode)
5775 switch (mode)
5777 case SFmode:
5778 return CODE_FOR_aarch64_reload_movcpsfdi;
5780 case DFmode:
5781 return CODE_FOR_aarch64_reload_movcpdfdi;
5783 case TFmode:
5784 return CODE_FOR_aarch64_reload_movcptfdi;
5786 case V8QImode:
5787 return CODE_FOR_aarch64_reload_movcpv8qidi;
5789 case V16QImode:
5790 return CODE_FOR_aarch64_reload_movcpv16qidi;
5792 case V4HImode:
5793 return CODE_FOR_aarch64_reload_movcpv4hidi;
5795 case V8HImode:
5796 return CODE_FOR_aarch64_reload_movcpv8hidi;
5798 case V2SImode:
5799 return CODE_FOR_aarch64_reload_movcpv2sidi;
5801 case V4SImode:
5802 return CODE_FOR_aarch64_reload_movcpv4sidi;
5804 case V2DImode:
5805 return CODE_FOR_aarch64_reload_movcpv2didi;
5807 case V2DFmode:
5808 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5810 default:
5811 gcc_unreachable ();
5814 gcc_unreachable ();
5816 static reg_class_t
5817 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5818 reg_class_t rclass,
5819 machine_mode mode,
5820 secondary_reload_info *sri)
5823 /* If we have to disable direct literal pool loads and stores because the
5824 function is too big, then we need a scratch register. */
5825 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5826 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5827 || targetm.vector_mode_supported_p (GET_MODE (x)))
5828 && !aarch64_pcrelative_literal_loads)
5830 sri->icode = aarch64_constant_pool_reload_icode (mode);
5831 return NO_REGS;
5834 /* Without the TARGET_SIMD instructions we cannot move a Q register
5835 to a Q register directly. We need a scratch. */
5836 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5837 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5838 && reg_class_subset_p (rclass, FP_REGS))
5840 if (mode == TFmode)
5841 sri->icode = CODE_FOR_aarch64_reload_movtf;
5842 else if (mode == TImode)
5843 sri->icode = CODE_FOR_aarch64_reload_movti;
5844 return NO_REGS;
5847 /* A TFmode or TImode memory access should be handled via an FP_REGS
5848 because AArch64 has richer addressing modes for LDR/STR instructions
5849 than LDP/STP instructions. */
5850 if (TARGET_FLOAT && rclass == GENERAL_REGS
5851 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5852 return FP_REGS;
5854 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
5855 return GENERAL_REGS;
5857 return NO_REGS;
5860 static bool
5861 aarch64_can_eliminate (const int from, const int to)
5863 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5864 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5866 if (frame_pointer_needed)
5868 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5869 return true;
5870 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5871 return false;
5872 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5873 && !cfun->calls_alloca)
5874 return true;
5875 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5876 return true;
5878 return false;
5880 else
5882 /* If we decided that we didn't need a leaf frame pointer but then used
5883 LR in the function, then we'll want a frame pointer after all, so
5884 prevent this elimination to ensure a frame pointer is used. */
5885 if (to == STACK_POINTER_REGNUM
5886 && flag_omit_leaf_frame_pointer
5887 && df_regs_ever_live_p (LR_REGNUM))
5888 return false;
5891 return true;
5894 HOST_WIDE_INT
5895 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5897 aarch64_layout_frame ();
5899 if (to == HARD_FRAME_POINTER_REGNUM)
5901 if (from == ARG_POINTER_REGNUM)
5902 return cfun->machine->frame.hard_fp_offset;
5904 if (from == FRAME_POINTER_REGNUM)
5905 return cfun->machine->frame.hard_fp_offset
5906 - cfun->machine->frame.locals_offset;
5909 if (to == STACK_POINTER_REGNUM)
5911 if (from == FRAME_POINTER_REGNUM)
5912 return cfun->machine->frame.frame_size
5913 - cfun->machine->frame.locals_offset;
5916 return cfun->machine->frame.frame_size;
5919 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5920 previous frame. */
5923 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5925 if (count != 0)
5926 return const0_rtx;
5927 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5931 static void
5932 aarch64_asm_trampoline_template (FILE *f)
5934 if (TARGET_ILP32)
5936 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5937 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5939 else
5941 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5942 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5944 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5945 assemble_aligned_integer (4, const0_rtx);
5946 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5947 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5950 static void
5951 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5953 rtx fnaddr, mem, a_tramp;
5954 const int tramp_code_sz = 16;
5956 /* We don't need to copy the trailing D-words; we fill those in below. */
5957 emit_block_move (m_tramp, assemble_trampoline_template (),
5958 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5959 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5960 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5961 if (GET_MODE (fnaddr) != ptr_mode)
5962 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5963 emit_move_insn (mem, fnaddr);
5965 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5966 emit_move_insn (mem, chain_value);
5968 /* XXX We should really define a "clear_cache" pattern and use
5969 gen_clear_cache(). */
5970 a_tramp = XEXP (m_tramp, 0);
5971 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5972 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5973 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5974 ptr_mode);
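/* Editorial sketch of the resulting LP64 trampoline layout, assuming
   the usual 32-byte TRAMPOLINE_SIZE and 8-byte pointers:

     bytes  0..15   code copied from the template above
     bytes 16..23   address of the nested function (stored here)
     bytes 24..31   static chain value (stored here)

   The two PC-relative LDRs in the template load these trailing words
   into IP1 and the static-chain register before the BR.  */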
5977 static unsigned char
5978 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5980 switch (regclass)
5982 case CALLER_SAVE_REGS:
5983 case POINTER_REGS:
5984 case GENERAL_REGS:
5985 case ALL_REGS:
5986 case FP_REGS:
5987 case FP_LO_REGS:
5988 return
5989 aarch64_vector_mode_p (mode)
5990 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5991 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5992 case STACK_REG:
5993 return 1;
5995 case NO_REGS:
5996 return 0;
5998 default:
5999 break;
6001 gcc_unreachable ();
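/* Editorial sketch (not part of the original file): the rounding-up
   division used above, written out standalone.  It assumes 8-byte X
   registers (UNITS_PER_WORD) and 16-byte Q registers (UNITS_PER_VREG).
   For example, a 16-byte TImode value needs (16 + 8 - 1) / 8 = 2
   general registers, while a 16-byte V4SImode vector needs
   (16 + 16 - 1) / 16 = 1 FP/SIMD register.  */
static inline unsigned
aarch64_example_nregs_for_size (unsigned mode_size, unsigned reg_size)
{
  /* Ceiling division: round MODE_SIZE up to a whole number of
     registers of REG_SIZE bytes each.  */
  return (mode_size + reg_size - 1) / reg_size;
}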
6004 static reg_class_t
6005 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6007 if (regclass == POINTER_REGS)
6008 return GENERAL_REGS;
6010 if (regclass == STACK_REG)
6012 if (REG_P (x)
6013 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6014 return regclass;
6016 return NO_REGS;
6019 /* Register elimination can result in a request for
6020 SP+constant->FP_REGS. We cannot support such operations, which
6021 use SP as the source and an FP_REG as the destination, so reject
6022 them outright. */
6023 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6025 rtx lhs = XEXP (x, 0);
6027 /* Look through a possible SUBREG introduced by ILP32. */
6028 if (GET_CODE (lhs) == SUBREG)
6029 lhs = SUBREG_REG (lhs);
6031 gcc_assert (REG_P (lhs));
6032 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6033 POINTER_REGS));
6034 return NO_REGS;
6037 return regclass;
6040 void
6041 aarch64_asm_output_labelref (FILE* f, const char *name)
6043 asm_fprintf (f, "%U%s", name);
6046 static void
6047 aarch64_elf_asm_constructor (rtx symbol, int priority)
6049 if (priority == DEFAULT_INIT_PRIORITY)
6050 default_ctor_section_asm_out_constructor (symbol, priority);
6051 else
6053 section *s;
6054 /* While priority is known to be in the range [0, 65535], and so
6055 18 bytes would be enough, the compiler might not know that. To
6056 avoid a -Wformat-truncation false positive, use a larger size. */
6057 char buf[23];
6058 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6059 s = get_section (buf, SECTION_WRITE, NULL);
6060 switch_to_section (s);
6061 assemble_align (POINTER_SIZE);
6062 assemble_aligned_integer (POINTER_BYTES, symbol);
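/* For example (editorial note), a constructor registered with
   __attribute__((constructor (101))) is placed in a section named
   ".init_array.00101", since the "%.5u" format zero-pads the priority
   to five digits; default-priority constructors are handled by the
   generic hook above instead.  */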
6066 static void
6067 aarch64_elf_asm_destructor (rtx symbol, int priority)
6069 if (priority == DEFAULT_INIT_PRIORITY)
6070 default_dtor_section_asm_out_destructor (symbol, priority);
6071 else
6073 section *s;
6074 /* While priority is known to be in the range [0, 65535], and so
6075 18 bytes would be enough, the compiler might not know that. To
6076 avoid a -Wformat-truncation false positive, use a larger size. */
6077 char buf[23];
6078 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6079 s = get_section (buf, SECTION_WRITE, NULL);
6080 switch_to_section (s);
6081 assemble_align (POINTER_SIZE);
6082 assemble_aligned_integer (POINTER_BYTES, symbol);
6086 const char*
6087 aarch64_output_casesi (rtx *operands)
6089 char buf[100];
6090 char label[100];
6091 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6092 int index;
6093 static const char *const patterns[4][2] =
6096 "ldrb\t%w3, [%0,%w1,uxtw]",
6097 "add\t%3, %4, %w3, sxtb #2"
6100 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6101 "add\t%3, %4, %w3, sxth #2"
6104 "ldr\t%w3, [%0,%w1,uxtw #2]",
6105 "add\t%3, %4, %w3, sxtw #2"
6107 /* We assume that DImode is only generated when not optimizing and
6108 that we don't really need 64-bit address offsets. That would
6109 imply an object file with 8GB of code in a single function! */
6111 "ldr\t%w3, [%0,%w1,uxtw #2]",
6112 "add\t%3, %4, %w3, sxtw #2"
6116 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6118 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6120 gcc_assert (index >= 0 && index <= 3);
6122 /* Need to implement table size reduction, by changing the code below. */
6123 output_asm_insn (patterns[index][0], operands);
6124 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6125 snprintf (buf, sizeof (buf),
6126 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6127 output_asm_insn (buf, operands);
6128 output_asm_insn (patterns[index][1], operands);
6129 output_asm_insn ("br\t%3", operands);
6130 assemble_label (asm_out_file, label);
6131 return "";
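/* Editorial example of the sequence emitted above for a byte-sized
   dispatch table (index 0 in PATTERNS), with hypothetical register
   operands:

       ldrb  w3, [x0, w1, uxtw]     // load the table entry
       adr   x4, Lrtx<N>            // address of the table base
       add   x3, x4, w3, sxtb #2    // scale the entry to a byte offset
       br    x3                     // dispatch

   where Lrtx<N> is the internal label assembled just before the
   jump table itself.  */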
6135 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6136 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6137 operator. */
6140 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6142 if (shift >= 0 && shift <= 3)
6144 int size;
6145 for (size = 8; size <= 32; size *= 2)
6147 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6148 if (mask == bits << shift)
6149 return size;
6152 return 0;
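/* Worked example (editorial): with SHIFT == 1 and MASK == 0x1fe the
   loop above finds SIZE == 8, because 0x1fe == 0xff << 1, so the
   operand is suitable for an extended-register form such as
       add  x0, x1, w2, uxtb #1
   A mask that is not a contiguous 8-, 16- or 32-bit field shifted by
   0..3 makes the function return 0 and the operand is not treated as
   an extend.  */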
6155 /* Constant pools are per-function only when PC-relative
6156 literal loads are enabled or we are using the large memory
6157 model. */
6159 static inline bool
6160 aarch64_can_use_per_function_literal_pools_p (void)
6162 return (aarch64_pcrelative_literal_loads
6163 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6166 static bool
6167 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6169 /* FIXME: In an ideal world this would work similarly
6170 to the logic in aarch64_select_rtx_section, but this
6171 breaks bootstrap in gccgo. For now we work around
6172 this by returning false here. */
6173 return false;
6176 /* Select appropriate section for constants depending
6177 on where we place literal pools. */
6179 static section *
6180 aarch64_select_rtx_section (machine_mode mode,
6181 rtx x,
6182 unsigned HOST_WIDE_INT align)
6184 if (aarch64_can_use_per_function_literal_pools_p ())
6185 return function_section (current_function_decl);
6187 return default_elf_select_rtx_section (mode, x, align);
6190 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6191 void
6192 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6193 HOST_WIDE_INT offset)
6195 /* When using per-function literal pools, we must ensure that any code
6196 section is aligned to the minimal instruction length, lest we get
6197 errors from the assembler re "unaligned instructions". */
6198 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6199 ASM_OUTPUT_ALIGN (f, 2);
6202 /* Costs. */
6204 /* Helper function for rtx cost calculation. Strip a shift expression
6205 from X. Returns the inner operand if successful, or the original
6206 expression on failure. */
6207 static rtx
6208 aarch64_strip_shift (rtx x)
6210 rtx op = x;
6212 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6213 we can convert both to ROR during final output. */
6214 if ((GET_CODE (op) == ASHIFT
6215 || GET_CODE (op) == ASHIFTRT
6216 || GET_CODE (op) == LSHIFTRT
6217 || GET_CODE (op) == ROTATERT
6218 || GET_CODE (op) == ROTATE)
6219 && CONST_INT_P (XEXP (op, 1)))
6220 return XEXP (op, 0);
6222 if (GET_CODE (op) == MULT
6223 && CONST_INT_P (XEXP (op, 1))
6224 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6225 return XEXP (op, 0);
6227 return x;
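/* For instance (editorial note), this reduces both
   (ashift (reg X) (const_int 3)) and (mult (reg X) (const_int 8))
   to (reg X), since either form can be folded into the shifted
   operand of the enclosing instruction, whereas a shift by a
   register (non-constant amount) is returned unchanged.  */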
6230 /* Helper function for rtx cost calculation. Strip an extend
6231 expression from X. Returns the inner operand if successful, or the
6232 original expression on failure. We deal with a number of possible
6233 canonicalization variations here. If STRIP_SHIFT is true, then
6234 we can strip off a shift also. */
6235 static rtx
6236 aarch64_strip_extend (rtx x, bool strip_shift)
6238 rtx op = x;
6240 /* Zero and sign extraction of a widened value. */
6241 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6242 && XEXP (op, 2) == const0_rtx
6243 && GET_CODE (XEXP (op, 0)) == MULT
6244 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6245 XEXP (op, 1)))
6246 return XEXP (XEXP (op, 0), 0);
6248 /* It can also be represented (for zero-extend) as an AND with an
6249 immediate. */
6250 if (GET_CODE (op) == AND
6251 && GET_CODE (XEXP (op, 0)) == MULT
6252 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6253 && CONST_INT_P (XEXP (op, 1))
6254 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6255 INTVAL (XEXP (op, 1))) != 0)
6256 return XEXP (XEXP (op, 0), 0);
6258 /* Now handle extended register, as this may also have an optional
6259 left shift by 1..4. */
6260 if (strip_shift
6261 && GET_CODE (op) == ASHIFT
6262 && CONST_INT_P (XEXP (op, 1))
6263 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6264 op = XEXP (op, 0);
6266 if (GET_CODE (op) == ZERO_EXTEND
6267 || GET_CODE (op) == SIGN_EXTEND)
6268 op = XEXP (op, 0);
6270 if (op != x)
6271 return op;
6273 return x;
6276 /* Return true iff CODE is a shift supported in combination
6277 with arithmetic instructions. */
6279 static bool
6280 aarch64_shift_p (enum rtx_code code)
6282 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6286 /* Return true iff X is a cheap shift without a sign extend. */
6288 static bool
6289 aarch64_cheap_mult_shift_p (rtx x)
6291 rtx op0, op1;
6293 op0 = XEXP (x, 0);
6294 op1 = XEXP (x, 1);
6296 if (!(aarch64_tune_params.extra_tuning_flags
6297 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6298 return false;
6300 if (GET_CODE (op0) == SIGN_EXTEND)
6301 return false;
6303 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6304 && UINTVAL (op1) <= 4)
6305 return true;
6307 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6308 return false;
6310 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6312 if (l2 > 0 && l2 <= 4)
6313 return true;
6315 return false;
6318 /* Helper function for rtx cost calculation. Calculate the cost of
6319 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6320 Return the calculated cost of the expression, recursing manually in to
6321 operands where needed. */
6323 static int
6324 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6326 rtx op0, op1;
6327 const struct cpu_cost_table *extra_cost
6328 = aarch64_tune_params.insn_extra_cost;
6329 int cost = 0;
6330 bool compound_p = (outer == PLUS || outer == MINUS);
6331 machine_mode mode = GET_MODE (x);
6333 gcc_checking_assert (code == MULT);
6335 op0 = XEXP (x, 0);
6336 op1 = XEXP (x, 1);
6338 if (VECTOR_MODE_P (mode))
6339 mode = GET_MODE_INNER (mode);
6341 /* Integer multiply/fma. */
6342 if (GET_MODE_CLASS (mode) == MODE_INT)
6344 /* The multiply will be canonicalized as a shift, cost it as such. */
6345 if (aarch64_shift_p (GET_CODE (x))
6346 || (CONST_INT_P (op1)
6347 && exact_log2 (INTVAL (op1)) > 0))
6349 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6350 || GET_CODE (op0) == SIGN_EXTEND;
6351 if (speed)
6353 if (compound_p)
6355 /* If the shift is considered cheap,
6356 then don't add any cost. */
6357 if (aarch64_cheap_mult_shift_p (x))
6359 else if (REG_P (op1))
6360 /* ARITH + shift-by-register. */
6361 cost += extra_cost->alu.arith_shift_reg;
6362 else if (is_extend)
6363 /* ARITH + extended register. We don't have a cost field
6364 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6365 cost += extra_cost->alu.extend_arith;
6366 else
6367 /* ARITH + shift-by-immediate. */
6368 cost += extra_cost->alu.arith_shift;
6370 else
6371 /* LSL (immediate). */
6372 cost += extra_cost->alu.shift;
6375 /* Strip extends as we will have costed them in the case above. */
6376 if (is_extend)
6377 op0 = aarch64_strip_extend (op0, true);
6379 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6381 return cost;
6384 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6385 compound and let the below cases handle it. After all, MNEG is a
6386 special-case alias of MSUB. */
6387 if (GET_CODE (op0) == NEG)
6389 op0 = XEXP (op0, 0);
6390 compound_p = true;
6393 /* Integer multiplies or FMAs have zero/sign extending variants. */
6394 if ((GET_CODE (op0) == ZERO_EXTEND
6395 && GET_CODE (op1) == ZERO_EXTEND)
6396 || (GET_CODE (op0) == SIGN_EXTEND
6397 && GET_CODE (op1) == SIGN_EXTEND))
6399 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6400 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6402 if (speed)
6404 if (compound_p)
6405 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6406 cost += extra_cost->mult[0].extend_add;
6407 else
6408 /* MUL/SMULL/UMULL. */
6409 cost += extra_cost->mult[0].extend;
6412 return cost;
6415 /* This is either an integer multiply or a MADD. In both cases
6416 we want to recurse and cost the operands. */
6417 cost += rtx_cost (op0, mode, MULT, 0, speed);
6418 cost += rtx_cost (op1, mode, MULT, 1, speed);
6420 if (speed)
6422 if (compound_p)
6423 /* MADD/MSUB. */
6424 cost += extra_cost->mult[mode == DImode].add;
6425 else
6426 /* MUL. */
6427 cost += extra_cost->mult[mode == DImode].simple;
6430 return cost;
6432 else
6434 if (speed)
6436 /* Floating-point FMA/FMUL can also support negations of the
6437 operands, unless the rounding mode is upward or downward in
6438 which case FNMUL is different from FMUL with operand negation. */
6439 bool neg0 = GET_CODE (op0) == NEG;
6440 bool neg1 = GET_CODE (op1) == NEG;
6441 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6443 if (neg0)
6444 op0 = XEXP (op0, 0);
6445 if (neg1)
6446 op1 = XEXP (op1, 0);
6449 if (compound_p)
6450 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6451 cost += extra_cost->fp[mode == DFmode].fma;
6452 else
6453 /* FMUL/FNMUL. */
6454 cost += extra_cost->fp[mode == DFmode].mult;
6457 cost += rtx_cost (op0, mode, MULT, 0, speed);
6458 cost += rtx_cost (op1, mode, MULT, 1, speed);
6459 return cost;
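/* Editorial example of how the above composes: costing
   (plus:DI (mult:DI (reg:DI x1) (const_int 8)) (reg:DI x2)) with
   OUTER == PLUS takes the shift path because 8 is a power of two,
   and, when optimizing for speed and the shift is not "cheap" for
   the current tuning, adds extra_cost->alu.arith_shift on top of the
   cost of the shifted register, matching a single
       add  x0, x2, x1, lsl #3
 */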
6463 static int
6464 aarch64_address_cost (rtx x,
6465 machine_mode mode,
6466 addr_space_t as ATTRIBUTE_UNUSED,
6467 bool speed)
6469 enum rtx_code c = GET_CODE (x);
6470 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6471 struct aarch64_address_info info;
6472 int cost = 0;
6473 info.shift = 0;
6475 if (!aarch64_classify_address (&info, x, mode, c, false))
6477 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6479 /* This is a CONST or SYMBOL ref which will be split
6480 in a different way depending on the code model in use.
6481 Cost it through the generic infrastructure. */
6482 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6483 /* Divide through by the cost of one instruction to
6484 bring it to the same units as the address costs. */
6485 cost_symbol_ref /= COSTS_N_INSNS (1);
6486 /* The cost is then the cost of preparing the address,
6487 followed by an immediate (possibly 0) offset. */
6488 return cost_symbol_ref + addr_cost->imm_offset;
6490 else
6492 /* This is most likely a jump table from a case
6493 statement. */
6494 return addr_cost->register_offset;
6498 switch (info.type)
6500 case ADDRESS_LO_SUM:
6501 case ADDRESS_SYMBOLIC:
6502 case ADDRESS_REG_IMM:
6503 cost += addr_cost->imm_offset;
6504 break;
6506 case ADDRESS_REG_WB:
6507 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6508 cost += addr_cost->pre_modify;
6509 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6510 cost += addr_cost->post_modify;
6511 else
6512 gcc_unreachable ();
6514 break;
6516 case ADDRESS_REG_REG:
6517 cost += addr_cost->register_offset;
6518 break;
6520 case ADDRESS_REG_SXTW:
6521 cost += addr_cost->register_sextend;
6522 break;
6524 case ADDRESS_REG_UXTW:
6525 cost += addr_cost->register_zextend;
6526 break;
6528 default:
6529 gcc_unreachable ();
6533 if (info.shift > 0)
6535 /* For the sake of calculating the cost of the shifted register
6536 component, we can treat same sized modes in the same way. */
6537 switch (GET_MODE_BITSIZE (mode))
6539 case 16:
6540 cost += addr_cost->addr_scale_costs.hi;
6541 break;
6543 case 32:
6544 cost += addr_cost->addr_scale_costs.si;
6545 break;
6547 case 64:
6548 cost += addr_cost->addr_scale_costs.di;
6549 break;
6551 /* We can't tell, or this is a 128-bit vector. */
6552 default:
6553 cost += addr_cost->addr_scale_costs.ti;
6554 break;
6558 return cost;
6561 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6562 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6563 to be taken. */
6566 aarch64_branch_cost (bool speed_p, bool predictable_p)
6568 /* When optimizing for speed, use the cost of unpredictable branches. */
6569 const struct cpu_branch_cost *branch_costs =
6570 aarch64_tune_params.branch_costs;
6572 if (!speed_p || predictable_p)
6573 return branch_costs->predictable;
6574 else
6575 return branch_costs->unpredictable;
6578 /* Return true if the RTX X in mode MODE is a zero or sign extract
6579 usable in an ADD or SUB (extended register) instruction. */
6580 static bool
6581 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6583 /* Catch add with a sign extract.
6584 This is add_<optab><mode>_multp2. */
6585 if (GET_CODE (x) == SIGN_EXTRACT
6586 || GET_CODE (x) == ZERO_EXTRACT)
6588 rtx op0 = XEXP (x, 0);
6589 rtx op1 = XEXP (x, 1);
6590 rtx op2 = XEXP (x, 2);
6592 if (GET_CODE (op0) == MULT
6593 && CONST_INT_P (op1)
6594 && op2 == const0_rtx
6595 && CONST_INT_P (XEXP (op0, 1))
6596 && aarch64_is_extend_from_extract (mode,
6597 XEXP (op0, 1),
6598 op1))
6600 return true;
6603 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6604 No shift. */
6605 else if (GET_CODE (x) == SIGN_EXTEND
6606 || GET_CODE (x) == ZERO_EXTEND)
6607 return REG_P (XEXP (x, 0));
6609 return false;
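/* Editorial example: (plus:DI (sign_extend:DI (reg:SI w1)) (reg:DI x2))
   satisfies the "simple case" above, so the addition can be costed
   (and later emitted) as a single extended-register instruction:
       add  x0, x2, w1, sxtw
 */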
6612 static bool
6613 aarch64_frint_unspec_p (unsigned int u)
6615 switch (u)
6617 case UNSPEC_FRINTZ:
6618 case UNSPEC_FRINTP:
6619 case UNSPEC_FRINTM:
6620 case UNSPEC_FRINTA:
6621 case UNSPEC_FRINTN:
6622 case UNSPEC_FRINTX:
6623 case UNSPEC_FRINTI:
6624 return true;
6626 default:
6627 return false;
6631 /* Return true iff X is an rtx that will match an extr instruction
6632 i.e. as described in the *extr<mode>5_insn family of patterns.
6633 OP0 and OP1 will be set to the operands of the shifts involved
6634 on success and will be NULL_RTX otherwise. */
6636 static bool
6637 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6639 rtx op0, op1;
6640 machine_mode mode = GET_MODE (x);
6642 *res_op0 = NULL_RTX;
6643 *res_op1 = NULL_RTX;
6645 if (GET_CODE (x) != IOR)
6646 return false;
6648 op0 = XEXP (x, 0);
6649 op1 = XEXP (x, 1);
6651 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6652 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6654 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6655 if (GET_CODE (op1) == ASHIFT)
6656 std::swap (op0, op1);
6658 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6659 return false;
6661 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6662 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6664 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6665 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6667 *res_op0 = XEXP (op0, 0);
6668 *res_op1 = XEXP (op1, 0);
6669 return true;
6673 return false;
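/* Editorial example: in DImode the expression
       (ior (ashift (reg x1) (const_int 10))
            (lshiftrt (reg x2) (const_int 54)))
   passes the check above because 10 + 54 == 64, so it can be costed
   as, and matched by, a single EXTR instruction instead of two
   shifts plus an ORR.  */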
6676 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6677 storing it in *COST. Result is true if the total cost of the operation
6678 has now been calculated. */
6679 static bool
6680 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6682 rtx inner;
6683 rtx comparator;
6684 enum rtx_code cmpcode;
6686 if (COMPARISON_P (op0))
6688 inner = XEXP (op0, 0);
6689 comparator = XEXP (op0, 1);
6690 cmpcode = GET_CODE (op0);
6692 else
6694 inner = op0;
6695 comparator = const0_rtx;
6696 cmpcode = NE;
6699 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6701 /* Conditional branch. */
6702 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6703 return true;
6704 else
6706 if (cmpcode == NE || cmpcode == EQ)
6708 if (comparator == const0_rtx)
6710 /* TBZ/TBNZ/CBZ/CBNZ. */
6711 if (GET_CODE (inner) == ZERO_EXTRACT)
6712 /* TBZ/TBNZ. */
6713 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6714 ZERO_EXTRACT, 0, speed);
6715 else
6716 /* CBZ/CBNZ. */
6717 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6719 return true;
6722 else if (cmpcode == LT || cmpcode == GE)
6724 /* TBZ/TBNZ. */
6725 if (comparator == const0_rtx)
6726 return true;
6730 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6732 /* CCMP. */
6733 if (GET_CODE (op1) == COMPARE)
6735 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6736 if (XEXP (op1, 1) == const0_rtx)
6737 *cost += 1;
6738 if (speed)
6740 machine_mode mode = GET_MODE (XEXP (op1, 0));
6741 const struct cpu_cost_table *extra_cost
6742 = aarch64_tune_params.insn_extra_cost;
6744 if (GET_MODE_CLASS (mode) == MODE_INT)
6745 *cost += extra_cost->alu.arith;
6746 else
6747 *cost += extra_cost->fp[mode == DFmode].compare;
6749 return true;
6752 /* It's a conditional operation based on the status flags,
6753 so it must be some flavor of CSEL. */
6755 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6756 if (GET_CODE (op1) == NEG
6757 || GET_CODE (op1) == NOT
6758 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6759 op1 = XEXP (op1, 0);
6760 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6762 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6763 op1 = XEXP (op1, 0);
6764 op2 = XEXP (op2, 0);
6767 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6768 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6769 return true;
6772 /* We don't know what this is, cost all operands. */
6773 return false;
6776 /* Check whether X is a bitfield operation of the form shift + extend that
6777 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6778 operand to which the bitfield operation is applied. Otherwise return
6779 NULL_RTX. */
6781 static rtx
6782 aarch64_extend_bitfield_pattern_p (rtx x)
6784 rtx_code outer_code = GET_CODE (x);
6785 machine_mode outer_mode = GET_MODE (x);
6787 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6788 && outer_mode != SImode && outer_mode != DImode)
6789 return NULL_RTX;
6791 rtx inner = XEXP (x, 0);
6792 rtx_code inner_code = GET_CODE (inner);
6793 machine_mode inner_mode = GET_MODE (inner);
6794 rtx op = NULL_RTX;
6796 switch (inner_code)
6798 case ASHIFT:
6799 if (CONST_INT_P (XEXP (inner, 1))
6800 && (inner_mode == QImode || inner_mode == HImode))
6801 op = XEXP (inner, 0);
6802 break;
6803 case LSHIFTRT:
6804 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6805 && (inner_mode == QImode || inner_mode == HImode))
6806 op = XEXP (inner, 0);
6807 break;
6808 case ASHIFTRT:
6809 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6810 && (inner_mode == QImode || inner_mode == HImode))
6811 op = XEXP (inner, 0);
6812 break;
6813 default:
6814 break;
6817 return op;
6820 /* Return true if the mask and a shift amount from an RTX of the form
6821 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6822 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6824 bool
6825 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6827 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6828 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6829 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6830 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
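/* Editorial sketch (not part of the original source): the UBFIZ
   validity check above restated over plain 64-bit integers, with a
   worked example.  For a 32-bit mode, MASK == 0x00ffff00 with
   SHIFT == 8 is accepted and corresponds to "ubfiz w0, w1, #8, #16",
   i.e. inserting a 16-bit field at bit position 8.  */
static inline bool
aarch64_example_ubfiz_ok (unsigned bitsize, unsigned long long mask,
                          unsigned shift)
{
  if (shift >= bitsize)
    return false;
  /* The bits below the shift amount must be clear in the mask...  */
  if (mask & (((unsigned long long) 1 << shift) - 1))
    return false;
  /* ...and the rest of the mask must be one contiguous field starting
     at bit SHIFT, i.e. (mask >> shift) + 1 must be a power of two.  */
  unsigned long long field = (mask >> shift) + 1;
  return field != 0 && (field & (field - 1)) == 0;
}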
6833 /* Calculate the cost of calculating X, storing it in *COST. Result
6834 is true if the total cost of the operation has now been calculated. */
6835 static bool
6836 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6837 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6839 rtx op0, op1, op2;
6840 const struct cpu_cost_table *extra_cost
6841 = aarch64_tune_params.insn_extra_cost;
6842 int code = GET_CODE (x);
6844 /* By default, assume that everything has equivalent cost to the
6845 cheapest instruction. Any additional costs are applied as a delta
6846 above this default. */
6847 *cost = COSTS_N_INSNS (1);
6849 switch (code)
6851 case SET:
6852 /* The cost depends entirely on the operands to SET. */
6853 *cost = 0;
6854 op0 = SET_DEST (x);
6855 op1 = SET_SRC (x);
6857 switch (GET_CODE (op0))
6859 case MEM:
6860 if (speed)
6862 rtx address = XEXP (op0, 0);
6863 if (VECTOR_MODE_P (mode))
6864 *cost += extra_cost->ldst.storev;
6865 else if (GET_MODE_CLASS (mode) == MODE_INT)
6866 *cost += extra_cost->ldst.store;
6867 else if (mode == SFmode)
6868 *cost += extra_cost->ldst.storef;
6869 else if (mode == DFmode)
6870 *cost += extra_cost->ldst.stored;
6872 *cost +=
6873 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6874 0, speed));
6877 *cost += rtx_cost (op1, mode, SET, 1, speed);
6878 return true;
6880 case SUBREG:
6881 if (! REG_P (SUBREG_REG (op0)))
6882 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6884 /* Fall through. */
6885 case REG:
6886 /* The cost is one per vector-register copied. */
6887 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6889 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6890 / GET_MODE_SIZE (V4SImode);
6891 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6893 /* const0_rtx is in general free, but we will use an
6894 instruction to set a register to 0. */
6895 else if (REG_P (op1) || op1 == const0_rtx)
6897 /* The cost is 1 per register copied. */
6898 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6899 / UNITS_PER_WORD;
6900 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6902 else
6903 /* Cost is just the cost of the RHS of the set. */
6904 *cost += rtx_cost (op1, mode, SET, 1, speed);
6905 return true;
6907 case ZERO_EXTRACT:
6908 case SIGN_EXTRACT:
6909 /* Bit-field insertion. Strip any redundant widening of
6910 the RHS to meet the width of the target. */
6911 if (GET_CODE (op1) == SUBREG)
6912 op1 = SUBREG_REG (op1);
6913 if ((GET_CODE (op1) == ZERO_EXTEND
6914 || GET_CODE (op1) == SIGN_EXTEND)
6915 && CONST_INT_P (XEXP (op0, 1))
6916 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6917 >= INTVAL (XEXP (op0, 1))))
6918 op1 = XEXP (op1, 0);
6920 if (CONST_INT_P (op1))
6922 /* MOV immediate is assumed to always be cheap. */
6923 *cost = COSTS_N_INSNS (1);
6925 else
6927 /* BFM. */
6928 if (speed)
6929 *cost += extra_cost->alu.bfi;
6930 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6933 return true;
6935 default:
6936 /* We can't make sense of this, assume default cost. */
6937 *cost = COSTS_N_INSNS (1);
6938 return false;
6940 return false;
6942 case CONST_INT:
6943 /* If an instruction can incorporate a constant within the
6944 instruction, the instruction's expression avoids calling
6945 rtx_cost() on the constant. If rtx_cost() is called on a
6946 constant, then it is usually because the constant must be
6947 moved into a register by one or more instructions.
6949 The exception is constant 0, which can be expressed
6950 as XZR/WZR and is therefore free. The exception to this is
6951 if we have (set (reg) (const0_rtx)) in which case we must cost
6952 the move. However, we can catch that when we cost the SET, so
6953 we don't need to consider that here. */
6954 if (x == const0_rtx)
6955 *cost = 0;
6956 else
6958 /* To an approximation, the cost of building any other constant
6959 is proportional to the number of instructions required to
6960 build that constant. This is true whether we are compiling
6961 for SPEED or otherwise. */
6962 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6963 (NULL_RTX, x, false, mode));
6965 return true;
6967 case CONST_DOUBLE:
6969 /* First determine number of instructions to do the move
6970 as an integer constant. */
6971 if (!aarch64_float_const_representable_p (x)
6972 && !aarch64_can_const_movi_rtx_p (x, mode)
6973 && aarch64_float_const_rtx_p (x))
6975 unsigned HOST_WIDE_INT ival;
6976 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6977 gcc_assert (succeed);
6979 machine_mode imode = mode == HFmode ? SImode
6980 : int_mode_for_mode (mode);
6981 int ncost = aarch64_internal_mov_immediate
6982 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6983 *cost += COSTS_N_INSNS (ncost);
6984 return true;
6987 if (speed)
6989 /* mov[df,sf]_aarch64. */
6990 if (aarch64_float_const_representable_p (x))
6991 /* FMOV (scalar immediate). */
6992 *cost += extra_cost->fp[mode == DFmode].fpconst;
6993 else if (!aarch64_float_const_zero_rtx_p (x))
6995 /* This will be a load from memory. */
6996 if (mode == DFmode)
6997 *cost += extra_cost->ldst.loadd;
6998 else
6999 *cost += extra_cost->ldst.loadf;
7001 else
7002 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7003 or MOV v0.s[0], wzr - neither of which is modeled by the
7004 cost tables. Just use the default cost. */
7009 return true;
7011 case MEM:
7012 if (speed)
7014 /* For loads we want the base cost of a load, plus an
7015 approximation for the additional cost of the addressing
7016 mode. */
7017 rtx address = XEXP (x, 0);
7018 if (VECTOR_MODE_P (mode))
7019 *cost += extra_cost->ldst.loadv;
7020 else if (GET_MODE_CLASS (mode) == MODE_INT)
7021 *cost += extra_cost->ldst.load;
7022 else if (mode == SFmode)
7023 *cost += extra_cost->ldst.loadf;
7024 else if (mode == DFmode)
7025 *cost += extra_cost->ldst.loadd;
7027 *cost +=
7028 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7029 0, speed));
7032 return true;
7034 case NEG:
7035 op0 = XEXP (x, 0);
7037 if (VECTOR_MODE_P (mode))
7039 if (speed)
7041 /* FNEG. */
7042 *cost += extra_cost->vect.alu;
7044 return false;
7047 if (GET_MODE_CLASS (mode) == MODE_INT)
7049 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7050 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7052 /* CSETM. */
7053 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7054 return true;
7057 /* Cost this as SUB wzr, X. */
7058 op0 = CONST0_RTX (mode);
7059 op1 = XEXP (x, 0);
7060 goto cost_minus;
7063 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7065 /* Support (neg(fma...)) as a single instruction only if
7066 sign of zeros is unimportant. This matches the decision
7067 making in aarch64.md. */
7068 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7070 /* FNMADD. */
7071 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7072 return true;
7074 if (GET_CODE (op0) == MULT)
7076 /* FNMUL. */
7077 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7078 return true;
7080 if (speed)
7081 /* FNEG. */
7082 *cost += extra_cost->fp[mode == DFmode].neg;
7083 return false;
7086 return false;
7088 case CLRSB:
7089 case CLZ:
7090 if (speed)
7092 if (VECTOR_MODE_P (mode))
7093 *cost += extra_cost->vect.alu;
7094 else
7095 *cost += extra_cost->alu.clz;
7098 return false;
7100 case COMPARE:
7101 op0 = XEXP (x, 0);
7102 op1 = XEXP (x, 1);
7104 if (op1 == const0_rtx
7105 && GET_CODE (op0) == AND)
7107 x = op0;
7108 mode = GET_MODE (op0);
7109 goto cost_logic;
7112 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7114 /* TODO: A write to the CC flags possibly costs extra; this
7115 needs encoding in the cost tables. */
7117 mode = GET_MODE (op0);
7118 /* ANDS. */
7119 if (GET_CODE (op0) == AND)
7121 x = op0;
7122 goto cost_logic;
7125 if (GET_CODE (op0) == PLUS)
7127 /* ADDS (and CMN alias). */
7128 x = op0;
7129 goto cost_plus;
7132 if (GET_CODE (op0) == MINUS)
7134 /* SUBS. */
7135 x = op0;
7136 goto cost_minus;
7139 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7140 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7141 && CONST_INT_P (XEXP (op0, 2)))
7143 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7144 Handle it here directly rather than going to cost_logic
7145 since we know the immediate generated for the TST is valid
7146 so we can avoid creating an intermediate rtx for it only
7147 for costing purposes. */
7148 if (speed)
7149 *cost += extra_cost->alu.logical;
7151 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7152 ZERO_EXTRACT, 0, speed);
7153 return true;
7156 if (GET_CODE (op1) == NEG)
7158 /* CMN. */
7159 if (speed)
7160 *cost += extra_cost->alu.arith;
7162 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7163 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7164 return true;
7167 /* CMP.
7169 Compare can freely swap the order of operands, and
7170 canonicalization puts the more complex operation first.
7171 But the integer MINUS logic expects the shift/extend
7172 operation in op1. */
7173 if (! (REG_P (op0)
7174 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7176 op0 = XEXP (x, 1);
7177 op1 = XEXP (x, 0);
7179 goto cost_minus;
7182 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7184 /* FCMP. */
7185 if (speed)
7186 *cost += extra_cost->fp[mode == DFmode].compare;
7188 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7190 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7191 /* FCMP supports constant 0.0 for no extra cost. */
7192 return true;
7194 return false;
7197 if (VECTOR_MODE_P (mode))
7199 /* Vector compare. */
7200 if (speed)
7201 *cost += extra_cost->vect.alu;
7203 if (aarch64_float_const_zero_rtx_p (op1))
7205 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7206 cost. */
7207 return true;
7209 return false;
7211 return false;
7213 case MINUS:
7215 op0 = XEXP (x, 0);
7216 op1 = XEXP (x, 1);
7218 cost_minus:
7219 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7221 /* Detect valid immediates. */
7222 if ((GET_MODE_CLASS (mode) == MODE_INT
7223 || (GET_MODE_CLASS (mode) == MODE_CC
7224 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7225 && CONST_INT_P (op1)
7226 && aarch64_uimm12_shift (INTVAL (op1)))
7228 if (speed)
7229 /* SUB(S) (immediate). */
7230 *cost += extra_cost->alu.arith;
7231 return true;
7234 /* Look for SUB (extended register). */
7235 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7237 if (speed)
7238 *cost += extra_cost->alu.extend_arith;
7240 op1 = aarch64_strip_extend (op1, true);
7241 *cost += rtx_cost (op1, VOIDmode,
7242 (enum rtx_code) GET_CODE (op1), 0, speed);
7243 return true;
7246 rtx new_op1 = aarch64_strip_extend (op1, false);
7248 /* Cost this as an FMA-alike operation. */
7249 if ((GET_CODE (new_op1) == MULT
7250 || aarch64_shift_p (GET_CODE (new_op1)))
7251 && code != COMPARE)
7253 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7254 (enum rtx_code) code,
7255 speed);
7256 return true;
7259 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7261 if (speed)
7263 if (VECTOR_MODE_P (mode))
7265 /* Vector SUB. */
7266 *cost += extra_cost->vect.alu;
7268 else if (GET_MODE_CLASS (mode) == MODE_INT)
7270 /* SUB(S). */
7271 *cost += extra_cost->alu.arith;
7273 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7275 /* FSUB. */
7276 *cost += extra_cost->fp[mode == DFmode].addsub;
7279 return true;
7282 case PLUS:
7284 rtx new_op0;
7286 op0 = XEXP (x, 0);
7287 op1 = XEXP (x, 1);
7289 cost_plus:
7290 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7291 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7293 /* CSINC. */
7294 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7295 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7296 return true;
7299 if (GET_MODE_CLASS (mode) == MODE_INT
7300 && CONST_INT_P (op1)
7301 && aarch64_uimm12_shift (INTVAL (op1)))
7303 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7305 if (speed)
7306 /* ADD (immediate). */
7307 *cost += extra_cost->alu.arith;
7308 return true;
7311 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7313 /* Look for ADD (extended register). */
7314 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7316 if (speed)
7317 *cost += extra_cost->alu.extend_arith;
7319 op0 = aarch64_strip_extend (op0, true);
7320 *cost += rtx_cost (op0, VOIDmode,
7321 (enum rtx_code) GET_CODE (op0), 0, speed);
7322 return true;
7325 /* Strip any extend, leave shifts behind as we will
7326 cost them through mult_cost. */
7327 new_op0 = aarch64_strip_extend (op0, false);
7329 if (GET_CODE (new_op0) == MULT
7330 || aarch64_shift_p (GET_CODE (new_op0)))
7332 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7333 speed);
7334 return true;
7337 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7339 if (speed)
7341 if (VECTOR_MODE_P (mode))
7343 /* Vector ADD. */
7344 *cost += extra_cost->vect.alu;
7346 else if (GET_MODE_CLASS (mode) == MODE_INT)
7348 /* ADD. */
7349 *cost += extra_cost->alu.arith;
7351 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7353 /* FADD. */
7354 *cost += extra_cost->fp[mode == DFmode].addsub;
7357 return true;
7360 case BSWAP:
7361 *cost = COSTS_N_INSNS (1);
7363 if (speed)
7365 if (VECTOR_MODE_P (mode))
7366 *cost += extra_cost->vect.alu;
7367 else
7368 *cost += extra_cost->alu.rev;
7370 return false;
7372 case IOR:
7373 if (aarch_rev16_p (x))
7375 *cost = COSTS_N_INSNS (1);
7377 if (speed)
7379 if (VECTOR_MODE_P (mode))
7380 *cost += extra_cost->vect.alu;
7381 else
7382 *cost += extra_cost->alu.rev;
7384 return true;
7387 if (aarch64_extr_rtx_p (x, &op0, &op1))
7389 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7390 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7391 if (speed)
7392 *cost += extra_cost->alu.shift;
7394 return true;
7396 /* Fall through. */
7397 case XOR:
7398 case AND:
7399 cost_logic:
7400 op0 = XEXP (x, 0);
7401 op1 = XEXP (x, 1);
7403 if (VECTOR_MODE_P (mode))
7405 if (speed)
7406 *cost += extra_cost->vect.alu;
7407 return true;
7410 if (code == AND
7411 && GET_CODE (op0) == MULT
7412 && CONST_INT_P (XEXP (op0, 1))
7413 && CONST_INT_P (op1)
7414 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7415 INTVAL (op1)) != 0)
7417 /* This is a UBFM/SBFM. */
7418 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7419 if (speed)
7420 *cost += extra_cost->alu.bfx;
7421 return true;
7424 if (GET_MODE_CLASS (mode) == MODE_INT)
7426 if (CONST_INT_P (op1))
7428 /* We have a mask + shift version of a UBFIZ
7429 i.e. the *andim_ashift<mode>_bfiz pattern. */
7430 if (GET_CODE (op0) == ASHIFT
7431 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7432 XEXP (op0, 1)))
7434 *cost += rtx_cost (XEXP (op0, 0), mode,
7435 (enum rtx_code) code, 0, speed);
7436 if (speed)
7437 *cost += extra_cost->alu.bfx;
7439 return true;
7441 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7443 /* We possibly get the immediate for free; this is not
7444 modelled. */
7445 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7446 if (speed)
7447 *cost += extra_cost->alu.logical;
7449 return true;
7452 else
7454 rtx new_op0 = op0;
7456 /* Handle ORN, EON, or BIC. */
7457 if (GET_CODE (op0) == NOT)
7458 op0 = XEXP (op0, 0);
7460 new_op0 = aarch64_strip_shift (op0);
7462 /* If we had a shift on op0 then this is a logical-shift-
7463 by-register/immediate operation. Otherwise, this is just
7464 a logical operation. */
7465 if (speed)
7467 if (new_op0 != op0)
7469 /* Shift by immediate. */
7470 if (CONST_INT_P (XEXP (op0, 1)))
7471 *cost += extra_cost->alu.log_shift;
7472 else
7473 *cost += extra_cost->alu.log_shift_reg;
7475 else
7476 *cost += extra_cost->alu.logical;
7479 /* In both cases we want to cost both operands. */
7480 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7481 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7483 return true;
7486 return false;
7488 case NOT:
7489 x = XEXP (x, 0);
7490 op0 = aarch64_strip_shift (x);
7492 if (VECTOR_MODE_P (mode))
7494 /* Vector NOT. */
7495 *cost += extra_cost->vect.alu;
7496 return false;
7499 /* MVN-shifted-reg. */
7500 if (op0 != x)
7502 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7504 if (speed)
7505 *cost += extra_cost->alu.log_shift;
7507 return true;
7509 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7510 Handle the second form here taking care that 'a' in the above can
7511 be a shift. */
7512 else if (GET_CODE (op0) == XOR)
7514 rtx newop0 = XEXP (op0, 0);
7515 rtx newop1 = XEXP (op0, 1);
7516 rtx op0_stripped = aarch64_strip_shift (newop0);
7518 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7519 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7521 if (speed)
7523 if (op0_stripped != newop0)
7524 *cost += extra_cost->alu.log_shift;
7525 else
7526 *cost += extra_cost->alu.logical;
7529 return true;
7531 /* MVN. */
7532 if (speed)
7533 *cost += extra_cost->alu.logical;
7535 return false;
7537 case ZERO_EXTEND:
7539 op0 = XEXP (x, 0);
7540 /* If a value is written in SI mode, then zero extended to DI
7541 mode, the operation will in general be free as a write to
7542 a 'w' register implicitly zeroes the upper bits of an 'x'
7543 register. However, if this is
7545 (set (reg) (zero_extend (reg)))
7547 we must cost the explicit register move. */
7548 if (mode == DImode
7549 && GET_MODE (op0) == SImode
7550 && outer == SET)
7552 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7554 /* If OP_COST is non-zero, then the cost of the zero extend
7555 is effectively the cost of the inner operation. Otherwise
7556 we have a MOV instruction and we take the cost from the MOV
7557 itself. This is true independently of whether we are
7558 optimizing for space or time. */
7559 if (op_cost)
7560 *cost = op_cost;
7562 return true;
7564 else if (MEM_P (op0))
7566 /* All loads can zero extend to any size for free. */
7567 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7568 return true;
7571 op0 = aarch64_extend_bitfield_pattern_p (x);
7572 if (op0)
7574 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7575 if (speed)
7576 *cost += extra_cost->alu.bfx;
7577 return true;
7580 if (speed)
7582 if (VECTOR_MODE_P (mode))
7584 /* UMOV. */
7585 *cost += extra_cost->vect.alu;
7587 else
7589 /* We generate an AND instead of UXTB/UXTH. */
7590 *cost += extra_cost->alu.logical;
7593 return false;
7595 case SIGN_EXTEND:
7596 if (MEM_P (XEXP (x, 0)))
7598 /* LDRSH. */
7599 if (speed)
7601 rtx address = XEXP (XEXP (x, 0), 0);
7602 *cost += extra_cost->ldst.load_sign_extend;
7604 *cost +=
7605 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7606 0, speed));
7608 return true;
7611 op0 = aarch64_extend_bitfield_pattern_p (x);
7612 if (op0)
7614 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7615 if (speed)
7616 *cost += extra_cost->alu.bfx;
7617 return true;
7620 if (speed)
7622 if (VECTOR_MODE_P (mode))
7623 *cost += extra_cost->vect.alu;
7624 else
7625 *cost += extra_cost->alu.extend;
7627 return false;
7629 case ASHIFT:
7630 op0 = XEXP (x, 0);
7631 op1 = XEXP (x, 1);
7633 if (CONST_INT_P (op1))
7635 if (speed)
7637 if (VECTOR_MODE_P (mode))
7639 /* Vector shift (immediate). */
7640 *cost += extra_cost->vect.alu;
7642 else
7644 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7645 aliases. */
7646 *cost += extra_cost->alu.shift;
7650 /* We can incorporate zero/sign extend for free. */
7651 if (GET_CODE (op0) == ZERO_EXTEND
7652 || GET_CODE (op0) == SIGN_EXTEND)
7653 op0 = XEXP (op0, 0);
7655 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7656 return true;
7658 else
7660 if (VECTOR_MODE_P (mode))
7662 if (speed)
7663 /* Vector shift (register). */
7664 *cost += extra_cost->vect.alu;
7666 else
7668 if (speed)
7669 /* LSLV. */
7670 *cost += extra_cost->alu.shift_reg;
7672 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7673 && CONST_INT_P (XEXP (op1, 1))
7674 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7676 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7677 /* We already demanded XEXP (op1, 0) to be REG_P, so
7678 don't recurse into it. */
7679 return true;
7682 return false; /* All arguments need to be in registers. */
7685 case ROTATE:
7686 case ROTATERT:
7687 case LSHIFTRT:
7688 case ASHIFTRT:
7689 op0 = XEXP (x, 0);
7690 op1 = XEXP (x, 1);
7692 if (CONST_INT_P (op1))
7694 /* ASR (immediate) and friends. */
7695 if (speed)
7697 if (VECTOR_MODE_P (mode))
7698 *cost += extra_cost->vect.alu;
7699 else
7700 *cost += extra_cost->alu.shift;
7703 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7704 return true;
7706 else
7708 if (VECTOR_MODE_P (mode))
7710 if (speed)
7711 /* Vector shift (register). */
7712 *cost += extra_cost->vect.alu;
7714 else
7716 if (speed)
7717 /* ASR (register) and friends. */
7718 *cost += extra_cost->alu.shift_reg;
7720 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7721 && CONST_INT_P (XEXP (op1, 1))
7722 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7724 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7725 /* We already demanded XEXP (op1, 0) to be REG_P, so
7726 don't recurse into it. */
7727 return true;
7730 return false; /* All arguments need to be in registers. */
7733 case SYMBOL_REF:
7735 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7736 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7738 /* LDR. */
7739 if (speed)
7740 *cost += extra_cost->ldst.load;
7742 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7743 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7745 /* ADRP, followed by ADD. */
7746 *cost += COSTS_N_INSNS (1);
7747 if (speed)
7748 *cost += 2 * extra_cost->alu.arith;
7750 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7751 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7753 /* ADR. */
7754 if (speed)
7755 *cost += extra_cost->alu.arith;
7758 if (flag_pic)
7760 /* One extra load instruction, after accessing the GOT. */
7761 *cost += COSTS_N_INSNS (1);
7762 if (speed)
7763 *cost += extra_cost->ldst.load;
7765 return true;
7767 case HIGH:
7768 case LO_SUM:
7769 /* ADRP/ADD (immediate). */
7770 if (speed)
7771 *cost += extra_cost->alu.arith;
7772 return true;
7774 case ZERO_EXTRACT:
7775 case SIGN_EXTRACT:
7776 /* UBFX/SBFX. */
7777 if (speed)
7779 if (VECTOR_MODE_P (mode))
7780 *cost += extra_cost->vect.alu;
7781 else
7782 *cost += extra_cost->alu.bfx;
7785 /* We can trust that the immediates used will be correct (there
7786 are no by-register forms), so we need only cost op0. */
7787 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7788 return true;
7790 case MULT:
7791 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7792 /* aarch64_rtx_mult_cost always handles recursion to its
7793 operands. */
7794 return true;
7796 case MOD:
7797 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7798 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7799 an unconditional negate. This case should only ever be reached through
7800 the set_smod_pow2_cheap check in expmed.c. */
7801 if (CONST_INT_P (XEXP (x, 1))
7802 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7803 && (mode == SImode || mode == DImode))
7805 /* We expand to 4 instructions. Reset the baseline. */
7806 *cost = COSTS_N_INSNS (4);
7808 if (speed)
7809 *cost += 2 * extra_cost->alu.logical
7810 + 2 * extra_cost->alu.arith;
7812 return true;
7815 /* Fall through. */
7816 case UMOD:
7817 if (speed)
7819 /* Slightly prefer UMOD over SMOD. */
7820 if (VECTOR_MODE_P (mode))
7821 *cost += extra_cost->vect.alu;
7822 else if (GET_MODE_CLASS (mode) == MODE_INT)
7823 *cost += (extra_cost->mult[mode == DImode].add
7824 + extra_cost->mult[mode == DImode].idiv
7825 + (code == MOD ? 1 : 0));
7827 return false; /* All arguments need to be in registers. */
7829 case DIV:
7830 case UDIV:
7831 case SQRT:
7832 if (speed)
7834 if (VECTOR_MODE_P (mode))
7835 *cost += extra_cost->vect.alu;
7836 else if (GET_MODE_CLASS (mode) == MODE_INT)
7837 /* There is no integer SQRT, so only DIV and UDIV can get
7838 here. */
7839 *cost += (extra_cost->mult[mode == DImode].idiv
7840 /* Slightly prefer UDIV over SDIV. */
7841 + (code == DIV ? 1 : 0));
7842 else
7843 *cost += extra_cost->fp[mode == DFmode].div;
7845 return false; /* All arguments need to be in registers. */
7847 case IF_THEN_ELSE:
7848 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7849 XEXP (x, 2), cost, speed);
7851 case EQ:
7852 case NE:
7853 case GT:
7854 case GTU:
7855 case LT:
7856 case LTU:
7857 case GE:
7858 case GEU:
7859 case LE:
7860 case LEU:
7862 return false; /* All arguments must be in registers. */
7864 case FMA:
7865 op0 = XEXP (x, 0);
7866 op1 = XEXP (x, 1);
7867 op2 = XEXP (x, 2);
7869 if (speed)
7871 if (VECTOR_MODE_P (mode))
7872 *cost += extra_cost->vect.alu;
7873 else
7874 *cost += extra_cost->fp[mode == DFmode].fma;
7877 /* FMSUB, FNMADD, and FNMSUB are free. */
7878 if (GET_CODE (op0) == NEG)
7879 op0 = XEXP (op0, 0);
7881 if (GET_CODE (op2) == NEG)
7882 op2 = XEXP (op2, 0);
7884 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7885 and the by-element operand as operand 0. */
7886 if (GET_CODE (op1) == NEG)
7887 op1 = XEXP (op1, 0);
7889 /* Catch vector-by-element operations. The by-element operand can
7890 either be (vec_duplicate (vec_select (x))) or just
7891 (vec_select (x)), depending on whether we are multiplying by
7892 a vector or a scalar.
7894 Canonicalization is not very good in these cases: FMA4 will put the
7895 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7896 if (GET_CODE (op0) == VEC_DUPLICATE)
7897 op0 = XEXP (op0, 0);
7898 else if (GET_CODE (op1) == VEC_DUPLICATE)
7899 op1 = XEXP (op1, 0);
7901 if (GET_CODE (op0) == VEC_SELECT)
7902 op0 = XEXP (op0, 0);
7903 else if (GET_CODE (op1) == VEC_SELECT)
7904 op1 = XEXP (op1, 0);
7906 /* If the remaining parameters are not registers,
7907 get the cost to put them into registers. */
7908 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7909 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7910 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7911 return true;
7913 case FLOAT:
7914 case UNSIGNED_FLOAT:
7915 if (speed)
7916 *cost += extra_cost->fp[mode == DFmode].fromint;
7917 return false;
7919 case FLOAT_EXTEND:
7920 if (speed)
7922 if (VECTOR_MODE_P (mode))
7924 /* Vector widening conversion. */
7925 *cost += extra_cost->vect.alu;
7927 else
7928 *cost += extra_cost->fp[mode == DFmode].widen;
7930 return false;
7932 case FLOAT_TRUNCATE:
7933 if (speed)
7935 if (VECTOR_MODE_P (mode))
7937 /* Vector narrowing conversion. */
7938 *cost += extra_cost->vect.alu;
7940 else
7941 *cost += extra_cost->fp[mode == DFmode].narrow;
7943 return false;
7945 case FIX:
7946 case UNSIGNED_FIX:
7947 x = XEXP (x, 0);
7948 /* Strip the rounding part. They will all be implemented
7949 by the fcvt* family of instructions anyway. */
7950 if (GET_CODE (x) == UNSPEC)
7952 unsigned int uns_code = XINT (x, 1);
7954 if (uns_code == UNSPEC_FRINTA
7955 || uns_code == UNSPEC_FRINTM
7956 || uns_code == UNSPEC_FRINTN
7957 || uns_code == UNSPEC_FRINTP
7958 || uns_code == UNSPEC_FRINTZ)
7959 x = XVECEXP (x, 0, 0);
7962 if (speed)
7964 if (VECTOR_MODE_P (mode))
7965 *cost += extra_cost->vect.alu;
7966 else
7967 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7970 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7971 fixed-point fcvt. */
7972 if (GET_CODE (x) == MULT
7973 && ((VECTOR_MODE_P (mode)
7974 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7975 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7977 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7978 0, speed);
7979 return true;
7982 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7983 return true;
7985 case ABS:
7986 if (VECTOR_MODE_P (mode))
7988 /* ABS (vector). */
7989 if (speed)
7990 *cost += extra_cost->vect.alu;
7992 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7994 op0 = XEXP (x, 0);
7996 /* FABD, which is analogous to FADD. */
7997 if (GET_CODE (op0) == MINUS)
7999 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8000 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8001 if (speed)
8002 *cost += extra_cost->fp[mode == DFmode].addsub;
8004 return true;
8006 /* Simple FABS is analogous to FNEG. */
8007 if (speed)
8008 *cost += extra_cost->fp[mode == DFmode].neg;
8010 else
8012 /* Integer ABS will either be split into
8013 two arithmetic instructions, or will be an ABS
8014 (scalar), which we don't model. */
8015 *cost = COSTS_N_INSNS (2);
8016 if (speed)
8017 *cost += 2 * extra_cost->alu.arith;
8019 return false;
8021 case SMAX:
8022 case SMIN:
8023 if (speed)
8025 if (VECTOR_MODE_P (mode))
8026 *cost += extra_cost->vect.alu;
8027 else
8029 /* FMAXNM/FMINNM/FMAX/FMIN.
8030 TODO: This may not be accurate for all implementations, but
8031 we do not model this in the cost tables. */
8032 *cost += extra_cost->fp[mode == DFmode].addsub;
8035 return false;
8037 case UNSPEC:
8038 /* The floating point round to integer frint* instructions. */
8039 if (aarch64_frint_unspec_p (XINT (x, 1)))
8041 if (speed)
8042 *cost += extra_cost->fp[mode == DFmode].roundint;
8044 return false;
8047 if (XINT (x, 1) == UNSPEC_RBIT)
8049 if (speed)
8050 *cost += extra_cost->alu.rev;
8052 return false;
8054 break;
8056 case TRUNCATE:
8058 /* Decompose <su>muldi3_highpart. */
8059 if (/* (truncate:DI */
8060 mode == DImode
8061 /* (lshiftrt:TI */
8062 && GET_MODE (XEXP (x, 0)) == TImode
8063 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8064 /* (mult:TI */
8065 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8066 /* (ANY_EXTEND:TI (reg:DI))
8067 (ANY_EXTEND:TI (reg:DI))) */
8068 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8069 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8070 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8071 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8072 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8073 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8074 /* (const_int 64) */
8075 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8076 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8078 /* UMULH/SMULH. */
8079 if (speed)
8080 *cost += extra_cost->mult[mode == DImode].extend;
8081 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8082 mode, MULT, 0, speed);
8083 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8084 mode, MULT, 1, speed);
8085 return true;
8088 /* Fall through. */
8089 default:
8090 break;
8093 if (dump_file
8094 && flag_aarch64_verbose_cost)
8095 fprintf (dump_file,
8096 "\nFailed to cost RTX. Assuming default cost.\n");
8098 return true;
8101 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
8102 calculated for X. This cost is stored in *COST. Returns true
8103 if the total cost of X was calculated. */
8104 static bool
8105 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8106 int param, int *cost, bool speed)
8108 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8110 if (dump_file
8111 && flag_aarch64_verbose_cost)
8113 print_rtl_single (dump_file, x);
8114 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8115 speed ? "Hot" : "Cold",
8116 *cost, result ? "final" : "partial");
8119 return result;
8122 static int
8123 aarch64_register_move_cost (machine_mode mode,
8124 reg_class_t from_i, reg_class_t to_i)
8126 enum reg_class from = (enum reg_class) from_i;
8127 enum reg_class to = (enum reg_class) to_i;
8128 const struct cpu_regmove_cost *regmove_cost
8129 = aarch64_tune_params.regmove_cost;
8131 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8132 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8133 to = GENERAL_REGS;
8135 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8136 from = GENERAL_REGS;
8138 /* Moving between GPR and stack cost is the same as GP2GP. */
8139 if ((from == GENERAL_REGS && to == STACK_REG)
8140 || (to == GENERAL_REGS && from == STACK_REG))
8141 return regmove_cost->GP2GP;
8143 /* To/From the stack register, we move via the gprs. */
8144 if (to == STACK_REG || from == STACK_REG)
8145 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8146 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8148 if (GET_MODE_SIZE (mode) == 16)
8150 /* 128-bit operations on general registers require 2 instructions. */
8151 if (from == GENERAL_REGS && to == GENERAL_REGS)
8152 return regmove_cost->GP2GP * 2;
8153 else if (from == GENERAL_REGS)
8154 return regmove_cost->GP2FP * 2;
8155 else if (to == GENERAL_REGS)
8156 return regmove_cost->FP2GP * 2;
8158 /* When AdvSIMD instructions are disabled it is not possible to move
8159 a 128-bit value directly between Q registers. This is handled in
8160 secondary reload. A general register is used as a scratch to move
8161 the upper DI value and the lower DI value is moved directly,
8162 hence the cost is the sum of three moves. */
8163 if (! TARGET_SIMD)
8164 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8166 return regmove_cost->FP2FP;
8169 if (from == GENERAL_REGS && to == GENERAL_REGS)
8170 return regmove_cost->GP2GP;
8171 else if (from == GENERAL_REGS)
8172 return regmove_cost->GP2FP;
8173 else if (to == GENERAL_REGS)
8174 return regmove_cost->FP2GP;
8176 return regmove_cost->FP2FP;
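/* A worked example with hypothetical cost-table values: if GP2FP = 5,
   FP2GP = 6 and FP2FP = 4, then a 16-byte move between FP registers is
   costed 4 when AdvSIMD is available, but 5 + 6 + 4 = 15 when it is not,
   matching the three-move sequence described above.  The numbers are
   illustrative only; the real values come from the selected tuning's
   regmove_cost table.  */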
8179 static int
8180 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8181 reg_class_t rclass ATTRIBUTE_UNUSED,
8182 bool in ATTRIBUTE_UNUSED)
8184 return aarch64_tune_params.memmov_cost;
8187 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8188 to optimize 1.0/sqrt. */
8190 static bool
8191 use_rsqrt_p (machine_mode mode)
8193 return (!flag_trapping_math
8194 && flag_unsafe_math_optimizations
8195 && ((aarch64_tune_params.approx_modes->recip_sqrt
8196 & AARCH64_APPROX_MODE (mode))
8197 || flag_mrecip_low_precision_sqrt));
8200 /* Function to decide when to use the approximate reciprocal square root
8201 builtin. */
8203 static tree
8204 aarch64_builtin_reciprocal (tree fndecl)
8206 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8208 if (!use_rsqrt_p (mode))
8209 return NULL_TREE;
8210 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8213 typedef rtx (*rsqrte_type) (rtx, rtx);
8215 /* Select reciprocal square root initial estimate insn depending on machine
8216 mode. */
8218 static rsqrte_type
8219 get_rsqrte_type (machine_mode mode)
8221 switch (mode)
8223 case DFmode: return gen_aarch64_rsqrtedf;
8224 case SFmode: return gen_aarch64_rsqrtesf;
8225 case V2DFmode: return gen_aarch64_rsqrtev2df;
8226 case V2SFmode: return gen_aarch64_rsqrtev2sf;
8227 case V4SFmode: return gen_aarch64_rsqrtev4sf;
8228 default: gcc_unreachable ();
8232 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8234 /* Select reciprocal square root series step insn depending on machine mode. */
8236 static rsqrts_type
8237 get_rsqrts_type (machine_mode mode)
8239 switch (mode)
8241 case DFmode: return gen_aarch64_rsqrtsdf;
8242 case SFmode: return gen_aarch64_rsqrtssf;
8243 case V2DFmode: return gen_aarch64_rsqrtsv2df;
8244 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
8245 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
8246 default: gcc_unreachable ();
8250 /* Emit instruction sequence to compute either the approximate square root
8251 or its approximate reciprocal, depending on the flag RECP, and return
8252 whether the sequence was emitted or not. */
8254 bool
8255 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8257 machine_mode mode = GET_MODE (dst);
8259 if (GET_MODE_INNER (mode) == HFmode)
8261 gcc_assert (!recp);
8262 return false;
8265 machine_mode mmsk
8266 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8267 GET_MODE_NUNITS (mode));
8268 if (!recp)
8270 if (!(flag_mlow_precision_sqrt
8271 || (aarch64_tune_params.approx_modes->sqrt
8272 & AARCH64_APPROX_MODE (mode))))
8273 return false;
8275 if (flag_finite_math_only
8276 || flag_trapping_math
8277 || !flag_unsafe_math_optimizations
8278 || optimize_function_for_size_p (cfun))
8279 return false;
8281 else
8282 /* Caller assumes we cannot fail. */
8283 gcc_assert (use_rsqrt_p (mode));
8286 rtx xmsk = gen_reg_rtx (mmsk);
8287 if (!recp)
8288 /* When calculating the approximate square root, compare the
8289 argument with 0.0 and create a mask. */
8290 emit_insn (gen_rtx_SET (xmsk,
8291 gen_rtx_NEG (mmsk,
8292 gen_rtx_EQ (mmsk, src,
8293 CONST0_RTX (mode)))));
8295 /* Estimate the approximate reciprocal square root. */
8296 rtx xdst = gen_reg_rtx (mode);
8297 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8299 /* Iterate over the series twice for SF and thrice for DF. */
8300 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8302 /* Optionally iterate over the series once less for faster performance
8303 while sacrificing accuracy. */
8304 if ((recp && flag_mrecip_low_precision_sqrt)
8305 || (!recp && flag_mlow_precision_sqrt))
8306 iterations--;
8308 /* Iterate over the series to calculate the approximate reciprocal square
8309 root. */
8310 rtx x1 = gen_reg_rtx (mode);
8311 while (iterations--)
8313 rtx x2 = gen_reg_rtx (mode);
8314 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8316 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8318 if (iterations > 0)
8319 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8322 if (!recp)
8324 /* Qualify the approximate reciprocal square root when the argument is
8325 0.0 by squashing the intermediary result to 0.0. */
8326 rtx xtmp = gen_reg_rtx (mmsk);
8327 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8328 gen_rtx_SUBREG (mmsk, xdst, 0)));
8329 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8331 /* Calculate the approximate square root. */
8332 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8335 /* Finalize the approximation. */
8336 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8338 return true;
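/* Sketch of the math behind the loop above: FRSQRTE produces an initial
   estimate x of 1/sqrt(a).  Each pass computes x*x, feeds it to FRSQRTS,
   which returns the Newton-Raphson correction (3 - a * x * x) / 2, and
   refines x by multiplying it with that correction (the final correction
   is folded into the last multiply).  For a square root the refined
   estimate is additionally multiplied by a, since a * 1/sqrt(a) == sqrt(a);
   the mask computed against 0.0 squashes the infinite intermediate so
   that sqrt(0.0) yields 0.0 rather than a NaN.  */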
8341 typedef rtx (*recpe_type) (rtx, rtx);
8343 /* Select reciprocal initial estimate insn depending on machine mode. */
8345 static recpe_type
8346 get_recpe_type (machine_mode mode)
8348 switch (mode)
8350 case SFmode: return (gen_aarch64_frecpesf);
8351 case V2SFmode: return (gen_aarch64_frecpev2sf);
8352 case V4SFmode: return (gen_aarch64_frecpev4sf);
8353 case DFmode: return (gen_aarch64_frecpedf);
8354 case V2DFmode: return (gen_aarch64_frecpev2df);
8355 default: gcc_unreachable ();
8359 typedef rtx (*recps_type) (rtx, rtx, rtx);
8361 /* Select reciprocal series step insn depending on machine mode. */
8363 static recps_type
8364 get_recps_type (machine_mode mode)
8366 switch (mode)
8368 case SFmode: return (gen_aarch64_frecpssf);
8369 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8370 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8371 case DFmode: return (gen_aarch64_frecpsdf);
8372 case V2DFmode: return (gen_aarch64_frecpsv2df);
8373 default: gcc_unreachable ();
8377 /* Emit the instruction sequence to compute the approximation for the division
8378 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8380 bool
8381 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8383 machine_mode mode = GET_MODE (quo);
8385 if (GET_MODE_INNER (mode) == HFmode)
8386 return false;
8388 bool use_approx_division_p = (flag_mlow_precision_div
8389 || (aarch64_tune_params.approx_modes->division
8390 & AARCH64_APPROX_MODE (mode)));
8392 if (!flag_finite_math_only
8393 || flag_trapping_math
8394 || !flag_unsafe_math_optimizations
8395 || optimize_function_for_size_p (cfun)
8396 || !use_approx_division_p)
8397 return false;
8399 /* Estimate the approximate reciprocal. */
8400 rtx xrcp = gen_reg_rtx (mode);
8401 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8403 /* Iterate over the series twice for SF and thrice for DF. */
8404 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8406 /* Optionally iterate over the series once less for faster performance,
8407 while sacrificing accuracy. */
8408 if (flag_mlow_precision_div)
8409 iterations--;
8411 /* Iterate over the series to calculate the approximate reciprocal. */
8412 rtx xtmp = gen_reg_rtx (mode);
8413 while (iterations--)
8415 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8417 if (iterations > 0)
8418 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8421 if (num != CONST1_RTX (mode))
8423 /* As the approximate reciprocal of DEN is already calculated, only
8424 calculate the approximate division when NUM is not 1.0. */
8425 rtx xnum = force_reg (mode, num);
8426 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8429 /* Finalize the approximation. */
8430 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8431 return true;
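/* Sketch of the math behind the loop above: FRECPE produces an initial
   estimate x of 1/d and each FRECPS step returns the Newton-Raphson
   correction (2 - d * x), so the estimate is refined as
   x <- x * (2 - d * x); the last correction is folded into the final
   multiply, and the quotient is then approximated as n * x.  */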
8434 /* Return the number of instructions that can be issued per cycle. */
8435 static int
8436 aarch64_sched_issue_rate (void)
8438 return aarch64_tune_params.issue_rate;
8441 static int
8442 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8444 int issue_rate = aarch64_sched_issue_rate ();
8446 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8450 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8451 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8452 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8454 static int
8455 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8456 int ready_index)
8458 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8462 /* Vectorizer cost model target hooks. */
8464 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8465 static int
8466 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8467 tree vectype,
8468 int misalign ATTRIBUTE_UNUSED)
8470 unsigned elements;
8471 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8472 bool fp = false;
8474 if (vectype != NULL)
8475 fp = FLOAT_TYPE_P (vectype);
8477 switch (type_of_cost)
8479 case scalar_stmt:
8480 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8482 case scalar_load:
8483 return costs->scalar_load_cost;
8485 case scalar_store:
8486 return costs->scalar_store_cost;
8488 case vector_stmt:
8489 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8491 case vector_load:
8492 return costs->vec_align_load_cost;
8494 case vector_store:
8495 return costs->vec_store_cost;
8497 case vec_to_scalar:
8498 return costs->vec_to_scalar_cost;
8500 case scalar_to_vec:
8501 return costs->scalar_to_vec_cost;
8503 case unaligned_load:
8504 return costs->vec_unalign_load_cost;
8506 case unaligned_store:
8507 return costs->vec_unalign_store_cost;
8509 case cond_branch_taken:
8510 return costs->cond_taken_branch_cost;
8512 case cond_branch_not_taken:
8513 return costs->cond_not_taken_branch_cost;
8515 case vec_perm:
8516 return costs->vec_permute_cost;
8518 case vec_promote_demote:
8519 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8521 case vec_construct:
8522 elements = TYPE_VECTOR_SUBPARTS (vectype);
8523 return elements / 2 + 1;
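/* For example, a vec_construct of a V4SF vector is costed
   4 / 2 + 1 = 3 by the line above.  */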
8525 default:
8526 gcc_unreachable ();
8530 /* Implement targetm.vectorize.add_stmt_cost. */
8531 static unsigned
8532 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8533 struct _stmt_vec_info *stmt_info, int misalign,
8534 enum vect_cost_model_location where)
8536 unsigned *cost = (unsigned *) data;
8537 unsigned retval = 0;
8539 if (flag_vect_cost_model)
8541 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8542 int stmt_cost =
8543 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8545 /* Statements in an inner loop relative to the loop being
8546 vectorized are weighted more heavily. The value here is
8547 arbitrary and could potentially be improved with analysis. */
8548 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8549 count *= 50; /* FIXME */
8551 retval = (unsigned) (count * stmt_cost);
8552 cost[where] += retval;
8555 return retval;
8558 static void initialize_aarch64_code_model (struct gcc_options *);
8560 /* Parse the TO_PARSE string and put the architecture struct that it
8561 selects into RES and the architectural features into ISA_FLAGS.
8562 Return an aarch64_parse_opt_result describing the parse result.
8563 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8565 static enum aarch64_parse_opt_result
8566 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8567 unsigned long *isa_flags)
8569 char *ext;
8570 const struct processor *arch;
8571 char *str = (char *) alloca (strlen (to_parse) + 1);
8572 size_t len;
8574 strcpy (str, to_parse);
8576 ext = strchr (str, '+');
8578 if (ext != NULL)
8579 len = ext - str;
8580 else
8581 len = strlen (str);
8583 if (len == 0)
8584 return AARCH64_PARSE_MISSING_ARG;
8587 /* Loop through the list of supported ARCHes to find a match. */
8588 for (arch = all_architectures; arch->name != NULL; arch++)
8590 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8592 unsigned long isa_temp = arch->flags;
8594 if (ext != NULL)
8596 /* TO_PARSE string contains at least one extension. */
8597 enum aarch64_parse_opt_result ext_res
8598 = aarch64_parse_extension (ext, &isa_temp);
8600 if (ext_res != AARCH64_PARSE_OK)
8601 return ext_res;
8603 /* Extension parsing was successful. Confirm the result
8604 arch and ISA flags. */
8605 *res = arch;
8606 *isa_flags = isa_temp;
8607 return AARCH64_PARSE_OK;
8611 /* ARCH name not found in list. */
8612 return AARCH64_PARSE_INVALID_ARG;
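/* For example, parsing "armv8-a+crc" splits the string at the first '+',
   matches "armv8-a" against all_architectures and then hands the "+crc"
   suffix to aarch64_parse_extension to adjust the ISA flags.  */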
8615 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
8616 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8617 describing the parse result. If there is an error parsing, RES and
8618 ISA_FLAGS are left unchanged. */
8620 static enum aarch64_parse_opt_result
8621 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8622 unsigned long *isa_flags)
8624 char *ext;
8625 const struct processor *cpu;
8626 char *str = (char *) alloca (strlen (to_parse) + 1);
8627 size_t len;
8629 strcpy (str, to_parse);
8631 ext = strchr (str, '+');
8633 if (ext != NULL)
8634 len = ext - str;
8635 else
8636 len = strlen (str);
8638 if (len == 0)
8639 return AARCH64_PARSE_MISSING_ARG;
8642 /* Loop through the list of supported CPUs to find a match. */
8643 for (cpu = all_cores; cpu->name != NULL; cpu++)
8645 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8647 unsigned long isa_temp = cpu->flags;
8650 if (ext != NULL)
8652 /* TO_PARSE string contains at least one extension. */
8653 enum aarch64_parse_opt_result ext_res
8654 = aarch64_parse_extension (ext, &isa_temp);
8656 if (ext_res != AARCH64_PARSE_OK)
8657 return ext_res;
8659 /* Extension parsing was successful. Confirm the result
8660 cpu and ISA flags. */
8661 *res = cpu;
8662 *isa_flags = isa_temp;
8663 return AARCH64_PARSE_OK;
8667 /* CPU name not found in list. */
8668 return AARCH64_PARSE_INVALID_ARG;
8671 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8672 Return an aarch64_parse_opt_result describing the parse result.
8673 If the parsing fails, RES does not change. */
8675 static enum aarch64_parse_opt_result
8676 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8678 const struct processor *cpu;
8679 char *str = (char *) alloca (strlen (to_parse) + 1);
8681 strcpy (str, to_parse);
8683 /* Loop through the list of supported CPUs to find a match. */
8684 for (cpu = all_cores; cpu->name != NULL; cpu++)
8686 if (strcmp (cpu->name, str) == 0)
8688 *res = cpu;
8689 return AARCH64_PARSE_OK;
8693 /* CPU name not found in list. */
8694 return AARCH64_PARSE_INVALID_ARG;
8697 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8698 described in FLAG. If it is, return the index bit for that fusion type.
8699 If not, error (printing OPTION_NAME) and return zero. */
8701 static unsigned int
8702 aarch64_parse_one_option_token (const char *token,
8703 size_t length,
8704 const struct aarch64_flag_desc *flag,
8705 const char *option_name)
8707 for (; flag->name != NULL; flag++)
8709 if (length == strlen (flag->name)
8710 && !strncmp (flag->name, token, length))
8711 return flag->flag;
8714 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8715 return 0;
8718 /* Parse OPTION which is a comma-separated list of flags to enable.
8719 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8720 default state we inherit from the CPU tuning structures. OPTION_NAME
8721 gives the top-level option we are parsing in the -moverride string,
8722 for use in error messages. */
8724 static unsigned int
8725 aarch64_parse_boolean_options (const char *option,
8726 const struct aarch64_flag_desc *flags,
8727 unsigned int initial_state,
8728 const char *option_name)
8730 const char separator = '.';
8731 const char* specs = option;
8732 const char* ntoken = option;
8733 unsigned int found_flags = initial_state;
8735 while ((ntoken = strchr (specs, separator)))
8737 size_t token_length = ntoken - specs;
8738 unsigned token_ops = aarch64_parse_one_option_token (specs,
8739 token_length,
8740 flags,
8741 option_name);
8742 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8743 in the token stream, reset the supported operations. So:
8745 adrp+add.cmp+branch.none.adrp+add
8747 would have the result of turning on only adrp+add fusion. */
8748 if (!token_ops)
8749 found_flags = 0;
8751 found_flags |= token_ops;
8752 specs = ++ntoken;
8755 /* The string ended with a separator; report it as ill-formed. */
8756 if (!(*specs))
8758 error ("%s string ill-formed\n", option_name);
8759 return 0;
8762 /* We still have one more token to parse. */
8763 size_t token_length = strlen (specs);
8764 unsigned token_ops = aarch64_parse_one_option_token (specs,
8765 token_length,
8766 flags,
8767 option_name);
8768 if (!token_ops)
8769 found_flags = 0;
8771 found_flags |= token_ops;
8772 return found_flags;
8775 /* Support for overriding instruction fusion. */
8777 static void
8778 aarch64_parse_fuse_string (const char *fuse_string,
8779 struct tune_params *tune)
8781 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8782 aarch64_fusible_pairs,
8783 tune->fusible_ops,
8784 "fuse=");
8787 /* Support for overriding other tuning flags. */
8789 static void
8790 aarch64_parse_tune_string (const char *tune_string,
8791 struct tune_params *tune)
8793 tune->extra_tuning_flags
8794 = aarch64_parse_boolean_options (tune_string,
8795 aarch64_tuning_flags,
8796 tune->extra_tuning_flags,
8797 "tune=");
8800 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8801 we understand. If it is, extract the option string and hand it off to
8802 the appropriate function. */
8804 void
8805 aarch64_parse_one_override_token (const char* token,
8806 size_t length,
8807 struct tune_params *tune)
8809 const struct aarch64_tuning_override_function *fn
8810 = aarch64_tuning_override_functions;
8812 const char *option_part = strchr (token, '=');
8813 if (!option_part)
8815 error ("tuning string missing in option (%s)", token);
8816 return;
8819 /* Get the length of the option name. */
8820 length = option_part - token;
8821 /* Skip the '=' to get to the option string. */
8822 option_part++;
8824 for (; fn->name != NULL; fn++)
8826 if (!strncmp (fn->name, token, length))
8828 fn->parse_override (option_part, tune);
8829 return;
8833 error ("unknown tuning option (%s)", token);
8834 return;
8837 /* Clamp the requested TLS size to what the selected code model supports. */
8839 static void
8840 initialize_aarch64_tls_size (struct gcc_options *opts)
8842 if (aarch64_tls_size == 0)
8843 aarch64_tls_size = 24;
8845 switch (opts->x_aarch64_cmodel_var)
8847 case AARCH64_CMODEL_TINY:
8848 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8849 needs two instructions to address, so we clamp the size to 24 bits. */
8850 if (aarch64_tls_size > 24)
8851 aarch64_tls_size = 24;
8852 break;
8853 case AARCH64_CMODEL_SMALL:
8854 /* The maximum TLS size allowed under small is 4G. */
8855 if (aarch64_tls_size > 32)
8856 aarch64_tls_size = 32;
8857 break;
8858 case AARCH64_CMODEL_LARGE:
8859 /* The maximum TLS size allowed under large is 16E.
8860 FIXME: 16E needs a 64-bit offset, but we only support 48-bit offsets now. */
8861 if (aarch64_tls_size > 48)
8862 aarch64_tls_size = 48;
8863 break;
8864 default:
8865 gcc_unreachable ();
8868 return;
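/* For instance, -mcmodel=tiny combined with -mtls-size=32 ends up with an
   effective TLS size of 24 bits, since the tiny model can only address 1M
   of TLS data.  */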
8871 /* Parse STRING looking for options in the format:
8872 string :: option:string
8873 option :: name=substring
8874 name :: {a-z}
8875 substring :: defined by option. */
8877 static void
8878 aarch64_parse_override_string (const char* input_string,
8879 struct tune_params* tune)
8881 const char separator = ':';
8882 size_t string_length = strlen (input_string) + 1;
8883 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8884 char *string = string_root;
8885 strncpy (string, input_string, string_length);
8886 string[string_length - 1] = '\0';
8888 char* ntoken = string;
8890 while ((ntoken = strchr (string, separator)))
8892 size_t token_length = ntoken - string;
8893 /* Make this substring look like a string. */
8894 *ntoken = '\0';
8895 aarch64_parse_one_override_token (string, token_length, tune);
8896 string = ++ntoken;
8899 /* One last option to parse. */
8900 aarch64_parse_one_override_token (string, strlen (string), tune);
8901 free (string_root);
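/* Tying the pieces together: an option such as
   -moverride=fuse=adrp+add.cmp+branch is first split at ':' into top-level
   name=value tokens (just one here), "fuse=..." is then dispatched to
   aarch64_parse_fuse_string, and the '.'-separated list turns on the
   adrp+add and cmp+branch fusion pairs on top of whatever the selected
   tuning already enables.  */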
8905 static void
8906 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8908 /* The logic here is that if we are disabling all frame pointer generation
8909 then we do not need to disable leaf frame pointer generation as a
8910 separate operation. But if we are *only* disabling leaf frame pointer
8911 generation then we set flag_omit_frame_pointer to true, but in
8912 aarch64_frame_pointer_required we return false only for leaf functions.
8914 PR 70044: We have to be careful about being called multiple times for the
8915 same function. Once we have decided to set flag_omit_frame_pointer just
8916 so that we can omit leaf frame pointers, we must then not interpret a
8917 second call as meaning that all frame pointer generation should be
8918 omitted. We do this by setting flag_omit_frame_pointer to a special,
8919 non-zero value. */
8920 if (opts->x_flag_omit_frame_pointer == 2)
8921 opts->x_flag_omit_frame_pointer = 0;
8923 if (opts->x_flag_omit_frame_pointer)
8924 opts->x_flag_omit_leaf_frame_pointer = false;
8925 else if (opts->x_flag_omit_leaf_frame_pointer)
8926 opts->x_flag_omit_frame_pointer = 2;
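/* The net effect: with only -momit-leaf-frame-pointer given,
   flag_omit_frame_pointer is left at the sentinel value 2, so
   aarch64_frame_pointer_required still keeps the frame pointer in
   non-leaf functions, as described above.  */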
8928 /* If not optimizing for size, set the default
8929 alignment to what the target wants. */
8930 if (!opts->x_optimize_size)
8932 if (opts->x_align_loops <= 0)
8933 opts->x_align_loops = aarch64_tune_params.loop_align;
8934 if (opts->x_align_jumps <= 0)
8935 opts->x_align_jumps = aarch64_tune_params.jump_align;
8936 if (opts->x_align_functions <= 0)
8937 opts->x_align_functions = aarch64_tune_params.function_align;
8940 /* We default to no pc-relative literal loads. */
8942 aarch64_pcrelative_literal_loads = false;
8944 /* If -mpc-relative-literal-loads is set on the command line, this
8945 implies that the user asked for PC relative literal loads. */
8946 if (opts->x_pcrelative_literal_loads == 1)
8947 aarch64_pcrelative_literal_loads = true;
8949 /* This is PR70113. When building the Linux kernel with
8950 CONFIG_ARM64_ERRATUM_843419, support for relocations
8951 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8952 removed from the kernel to avoid loading objects with possibly
8953 offending sequences. Without -mpc-relative-literal-loads we would
8954 generate such relocations, preventing the kernel build from
8955 succeeding. */
8956 if (opts->x_pcrelative_literal_loads == 2
8957 && TARGET_FIX_ERR_A53_843419)
8958 aarch64_pcrelative_literal_loads = true;
8960 /* In the tiny memory model it makes no sense to disallow PC relative
8961 literal pool loads. */
8962 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8963 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8964 aarch64_pcrelative_literal_loads = true;
8966 /* When enabling the lower precision Newton series for the square root, also
8967 enable it for the reciprocal square root, since the latter is an
8968 intermediary step for the former. */
8969 if (flag_mlow_precision_sqrt)
8970 flag_mrecip_low_precision_sqrt = true;
8973 /* 'Unpack' the internal tuning structs and update the options
8974 in OPTS. The caller must have set up selected_tune and selected_arch
8975 as all the other target-specific codegen decisions are
8976 derived from them. */
8978 void
8979 aarch64_override_options_internal (struct gcc_options *opts)
8981 aarch64_tune_flags = selected_tune->flags;
8982 aarch64_tune = selected_tune->sched_core;
8983 /* Make a copy of the tuning parameters attached to the core, which
8984 we may later overwrite. */
8985 aarch64_tune_params = *(selected_tune->tune);
8986 aarch64_architecture_version = selected_arch->architecture_version;
8988 if (opts->x_aarch64_override_tune_string)
8989 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8990 &aarch64_tune_params);
8992 /* This target defaults to strict volatile bitfields. */
8993 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8994 opts->x_flag_strict_volatile_bitfields = 1;
8996 initialize_aarch64_code_model (opts);
8997 initialize_aarch64_tls_size (opts);
8999 int queue_depth = 0;
9000 switch (aarch64_tune_params.autoprefetcher_model)
9002 case tune_params::AUTOPREFETCHER_OFF:
9003 queue_depth = -1;
9004 break;
9005 case tune_params::AUTOPREFETCHER_WEAK:
9006 queue_depth = 0;
9007 break;
9008 case tune_params::AUTOPREFETCHER_STRONG:
9009 queue_depth = max_insn_queue_index + 1;
9010 break;
9011 default:
9012 gcc_unreachable ();
9015 /* We don't mind passing in global_options_set here as we don't use
9016 the *options_set structs anyway. */
9017 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9018 queue_depth,
9019 opts->x_param_values,
9020 global_options_set.x_param_values);
9022 /* Set up parameters to be used in prefetching algorithm. Do not
9023 override the defaults unless we are tuning for a core we have
9024 researched values for. */
9025 if (aarch64_tune_params.prefetch->num_slots > 0)
9026 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9027 aarch64_tune_params.prefetch->num_slots,
9028 opts->x_param_values,
9029 global_options_set.x_param_values);
9030 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9031 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9032 aarch64_tune_params.prefetch->l1_cache_size,
9033 opts->x_param_values,
9034 global_options_set.x_param_values);
9035 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9036 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9037 aarch64_tune_params.prefetch->l1_cache_line_size,
9038 opts->x_param_values,
9039 global_options_set.x_param_values);
9040 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9041 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9042 aarch64_tune_params.prefetch->l2_cache_size,
9043 opts->x_param_values,
9044 global_options_set.x_param_values);
9046 /* Enable software prefetching at the specified optimization level for
9047 CPUs that have prefetch. Lower the optimization level threshold by 1
9048 when profiling is enabled. */
9049 if (opts->x_flag_prefetch_loop_arrays < 0
9050 && !opts->x_optimize_size
9051 && aarch64_tune_params.prefetch->default_opt_level >= 0
9052 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9053 opts->x_flag_prefetch_loop_arrays = 1;
9055 aarch64_override_options_after_change_1 (opts);
9058 /* Print a hint with a suggestion for a core or architecture name that
9059 most closely resembles what the user passed in STR. ARCH is true if
9060 the user is asking for an architecture name. ARCH is false if the user
9061 is asking for a core name. */
9063 static void
9064 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9066 auto_vec<const char *> candidates;
9067 const struct processor *entry = arch ? all_architectures : all_cores;
9068 for (; entry->name != NULL; entry++)
9069 candidates.safe_push (entry->name);
9070 char *s;
9071 const char *hint = candidates_list_and_hint (str, s, candidates);
9072 if (hint)
9073 inform (input_location, "valid arguments are: %s;"
9074 " did you mean %qs?", s, hint);
9075 XDELETEVEC (s);
9078 /* Print a hint with a suggestion for a core name that most closely resembles
9079 what the user passed in STR. */
9081 inline static void
9082 aarch64_print_hint_for_core (const char *str)
9084 aarch64_print_hint_for_core_or_arch (str, false);
9087 /* Print a hint with a suggestion for an architecture name that most closely
9088 resembles what the user passed in STR. */
9090 inline static void
9091 aarch64_print_hint_for_arch (const char *str)
9093 aarch64_print_hint_for_core_or_arch (str, true);
9096 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9097 specified in STR and throw errors if appropriate. Put the results if
9098 they are valid in RES and ISA_FLAGS. Return whether the option is
9099 valid. */
9101 static bool
9102 aarch64_validate_mcpu (const char *str, const struct processor **res,
9103 unsigned long *isa_flags)
9105 enum aarch64_parse_opt_result parse_res
9106 = aarch64_parse_cpu (str, res, isa_flags);
9108 if (parse_res == AARCH64_PARSE_OK)
9109 return true;
9111 switch (parse_res)
9113 case AARCH64_PARSE_MISSING_ARG:
9114 error ("missing cpu name in %<-mcpu=%s%>", str);
9115 break;
9116 case AARCH64_PARSE_INVALID_ARG:
9117 error ("unknown value %qs for -mcpu", str);
9118 aarch64_print_hint_for_core (str);
9119 break;
9120 case AARCH64_PARSE_INVALID_FEATURE:
9121 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9122 break;
9123 default:
9124 gcc_unreachable ();
9127 return false;
9130 /* Validate a command-line -march option. Parse the arch and extensions
9131 (if any) specified in STR and throw errors if appropriate. Put the
9132 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9133 option is valid. */
9135 static bool
9136 aarch64_validate_march (const char *str, const struct processor **res,
9137 unsigned long *isa_flags)
9139 enum aarch64_parse_opt_result parse_res
9140 = aarch64_parse_arch (str, res, isa_flags);
9142 if (parse_res == AARCH64_PARSE_OK)
9143 return true;
9145 switch (parse_res)
9147 case AARCH64_PARSE_MISSING_ARG:
9148 error ("missing arch name in %<-march=%s%>", str);
9149 break;
9150 case AARCH64_PARSE_INVALID_ARG:
9151 error ("unknown value %qs for -march", str);
9152 aarch64_print_hint_for_arch (str);
9153 break;
9154 case AARCH64_PARSE_INVALID_FEATURE:
9155 error ("invalid feature modifier in %<-march=%s%>", str);
9156 break;
9157 default:
9158 gcc_unreachable ();
9161 return false;
9164 /* Validate a command-line -mtune option. Parse the cpu
9165 specified in STR and throw errors if appropriate. Put the
9166 result, if it is valid, in RES. Return whether the option is
9167 valid. */
9169 static bool
9170 aarch64_validate_mtune (const char *str, const struct processor **res)
9172 enum aarch64_parse_opt_result parse_res
9173 = aarch64_parse_tune (str, res);
9175 if (parse_res == AARCH64_PARSE_OK)
9176 return true;
9178 switch (parse_res)
9180 case AARCH64_PARSE_MISSING_ARG:
9181 error ("missing cpu name in %<-mtune=%s%>", str);
9182 break;
9183 case AARCH64_PARSE_INVALID_ARG:
9184 error ("unknown value %qs for -mtune", str);
9185 aarch64_print_hint_for_core (str);
9186 break;
9187 default:
9188 gcc_unreachable ();
9190 return false;
9193 /* Return the CPU corresponding to the enum CPU.
9194 If it doesn't specify a cpu, return the default. */
9196 static const struct processor *
9197 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9199 if (cpu != aarch64_none)
9200 return &all_cores[cpu];
9202 /* The & 0x3f is to extract the bottom 6 bits that encode the
9203 default cpu as selected by the --with-cpu GCC configure option
9204 in config.gcc.
9205 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9206 flags mechanism should be reworked to make it more sane. */
9207 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
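/* In other words, TARGET_CPU_DEFAULT packs the configure-time default CPU
   into bits [5:0] and that CPU's default ISA flags into the remaining
   bits, which is why aarch64_override_options below recovers the flags
   with TARGET_CPU_DEFAULT >> 6.  */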
9210 /* Return the architecture corresponding to the enum ARCH.
9211 If it doesn't specify a valid architecture, return the default. */
9213 static const struct processor *
9214 aarch64_get_arch (enum aarch64_arch arch)
9216 if (arch != aarch64_no_arch)
9217 return &all_architectures[arch];
9219 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9221 return &all_architectures[cpu->arch];
9224 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9225 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9226 tuning structs. In particular it must set selected_tune and
9227 aarch64_isa_flags that define the available ISA features and tuning
9228 decisions. It must also set selected_arch as this will be used to
9229 output the .arch asm tags for each function. */
9231 static void
9232 aarch64_override_options (void)
9234 unsigned long cpu_isa = 0;
9235 unsigned long arch_isa = 0;
9236 aarch64_isa_flags = 0;
9238 bool valid_cpu = true;
9239 bool valid_tune = true;
9240 bool valid_arch = true;
9242 selected_cpu = NULL;
9243 selected_arch = NULL;
9244 selected_tune = NULL;
9246 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9247 If either of -march or -mtune is given, they override their
9248 respective component of -mcpu. */
9249 if (aarch64_cpu_string)
9250 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9251 &cpu_isa);
9253 if (aarch64_arch_string)
9254 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9255 &arch_isa);
9257 if (aarch64_tune_string)
9258 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9260 /* If the user did not specify a processor, choose the default
9261 one for them. This will be the CPU set during configuration using
9262 --with-cpu, otherwise it is "generic". */
9263 if (!selected_cpu)
9265 if (selected_arch)
9267 selected_cpu = &all_cores[selected_arch->ident];
9268 aarch64_isa_flags = arch_isa;
9269 explicit_arch = selected_arch->arch;
9271 else
9273 /* Get default configure-time CPU. */
9274 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9275 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9278 if (selected_tune)
9279 explicit_tune_core = selected_tune->ident;
9281 /* If both -mcpu and -march are specified check that they are architecturally
9282 compatible, warn if they're not and prefer the -march ISA flags. */
9283 else if (selected_arch)
9285 if (selected_arch->arch != selected_cpu->arch)
9287 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9288 all_architectures[selected_cpu->arch].name,
9289 selected_arch->name);
9291 aarch64_isa_flags = arch_isa;
9292 explicit_arch = selected_arch->arch;
9293 explicit_tune_core = selected_tune ? selected_tune->ident
9294 : selected_cpu->ident;
9296 else
9298 /* -mcpu but no -march. */
9299 aarch64_isa_flags = cpu_isa;
9300 explicit_tune_core = selected_tune ? selected_tune->ident
9301 : selected_cpu->ident;
9302 gcc_assert (selected_cpu);
9303 selected_arch = &all_architectures[selected_cpu->arch];
9304 explicit_arch = selected_arch->arch;
9307 /* Set the arch as well, since we will need it when outputting
9308 the .arch directive in assembly. */
9309 if (!selected_arch)
9311 gcc_assert (selected_cpu);
9312 selected_arch = &all_architectures[selected_cpu->arch];
9315 if (!selected_tune)
9316 selected_tune = selected_cpu;
9318 #ifndef HAVE_AS_MABI_OPTION
9319 /* The compiler may have been configured with 2.23.* binutils, which does
9320 not have support for ILP32. */
9321 if (TARGET_ILP32)
9322 error ("Assembler does not support -mabi=ilp32");
9323 #endif
9325 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9326 sorry ("Return address signing is only supported for -mabi=lp64");
9328 /* Make sure we properly set up the explicit options. */
9329 if ((aarch64_cpu_string && valid_cpu)
9330 || (aarch64_tune_string && valid_tune))
9331 gcc_assert (explicit_tune_core != aarch64_none);
9333 if ((aarch64_cpu_string && valid_cpu)
9334 || (aarch64_arch_string && valid_arch))
9335 gcc_assert (explicit_arch != aarch64_no_arch);
9337 aarch64_override_options_internal (&global_options);
9339 /* Save these options as the default ones in case we push and pop them later
9340 while processing functions with potential target attributes. */
9341 target_option_default_node = target_option_current_node
9342 = build_target_option_node (&global_options);
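/* For example, -mcpu=cortex-a53 on its own selects the architecture and
   ISA flags implied by Cortex-A53 as well as its tuning, while
   -mcpu=cortex-a53 -mtune=cortex-a72 keeps the Cortex-A53 architecture
   and ISA flags but tunes for Cortex-A72, following the precedence rules
   above.  */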
9345 /* Implement targetm.override_options_after_change. */
9347 static void
9348 aarch64_override_options_after_change (void)
9350 aarch64_override_options_after_change_1 (&global_options);
9353 static struct machine_function *
9354 aarch64_init_machine_status (void)
9356 struct machine_function *machine;
9357 machine = ggc_cleared_alloc<machine_function> ();
9358 return machine;
9361 void
9362 aarch64_init_expanders (void)
9364 init_machine_status = aarch64_init_machine_status;
9367 /* A checking mechanism for the implementation of the various code models. */
9368 static void
9369 initialize_aarch64_code_model (struct gcc_options *opts)
9371 if (opts->x_flag_pic)
9373 switch (opts->x_aarch64_cmodel_var)
9375 case AARCH64_CMODEL_TINY:
9376 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9377 break;
9378 case AARCH64_CMODEL_SMALL:
9379 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9380 aarch64_cmodel = (flag_pic == 2
9381 ? AARCH64_CMODEL_SMALL_PIC
9382 : AARCH64_CMODEL_SMALL_SPIC);
9383 #else
9384 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9385 #endif
9386 break;
9387 case AARCH64_CMODEL_LARGE:
9388 sorry ("code model %qs with -f%s", "large",
9389 opts->x_flag_pic > 1 ? "PIC" : "pic");
9390 break;
9391 default:
9392 gcc_unreachable ();
9395 else
9396 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9399 /* Implement TARGET_OPTION_SAVE. */
9401 static void
9402 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9404 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9407 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9408 using the information saved in PTR. */
9410 static void
9411 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9413 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9414 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9415 opts->x_explicit_arch = ptr->x_explicit_arch;
9416 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9417 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9419 aarch64_override_options_internal (opts);
9422 /* Implement TARGET_OPTION_PRINT. */
9424 static void
9425 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9427 const struct processor *cpu
9428 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9429 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9430 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9431 std::string extension
9432 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9434 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9435 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9436 arch->name, extension.c_str ());
9439 static GTY(()) tree aarch64_previous_fndecl;
9441 void
9442 aarch64_reset_previous_fndecl (void)
9444 aarch64_previous_fndecl = NULL;
9447 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9448 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9449 make sure optab availability predicates are recomputed when necessary. */
9451 void
9452 aarch64_save_restore_target_globals (tree new_tree)
9454 if (TREE_TARGET_GLOBALS (new_tree))
9455 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9456 else if (new_tree == target_option_default_node)
9457 restore_target_globals (&default_target_globals);
9458 else
9459 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9462 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9463 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9464 of the function, if such exists. This function may be called multiple
9465 times on a single function so use aarch64_previous_fndecl to avoid
9466 setting up identical state. */
9468 static void
9469 aarch64_set_current_function (tree fndecl)
9471 if (!fndecl || fndecl == aarch64_previous_fndecl)
9472 return;
9474 tree old_tree = (aarch64_previous_fndecl
9475 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9476 : NULL_TREE);
9478 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9480 /* If current function has no attributes but the previous one did,
9481 use the default node. */
9482 if (!new_tree && old_tree)
9483 new_tree = target_option_default_node;
9485 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9486 the default have been handled by aarch64_save_restore_target_globals from
9487 aarch64_pragma_target_parse. */
9488 if (old_tree == new_tree)
9489 return;
9491 aarch64_previous_fndecl = fndecl;
9493 /* First set the target options. */
9494 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9496 aarch64_save_restore_target_globals (new_tree);
9499 /* Enum describing the various ways we can handle attributes.
9500 In many cases we can reuse the generic option handling machinery. */
9502 enum aarch64_attr_opt_type
9504 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9505 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9506 aarch64_attr_enum, /* Attribute sets an enum variable. */
9507 aarch64_attr_custom /* Attribute requires a custom handling function. */
9510 /* All the information needed to handle a target attribute.
9511 NAME is the name of the attribute.
9512 ATTR_TYPE specifies the type of behavior of the attribute as described
9513 in the definition of enum aarch64_attr_opt_type.
9514 ALLOW_NEG is true if the attribute supports a "no-" form.
9515 HANDLER is the function that takes the attribute string and whether
9516 it is a pragma or attribute and handles the option. It is needed only
9517 when the ATTR_TYPE is aarch64_attr_custom.
9518 OPT_NUM is the enum specifying the option that the attribute modifies.
9519 This is needed for attributes that mirror the behavior of a command-line
9520 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9521 aarch64_attr_enum. */
9523 struct aarch64_attribute_info
9525 const char *name;
9526 enum aarch64_attr_opt_type attr_type;
9527 bool allow_neg;
9528 bool (*handler) (const char *, const char *);
9529 enum opt_code opt_num;
9532 /* Handle the ARCH_STR argument to the arch= target attribute.
9533 PRAGMA_OR_ATTR is used in potential error messages. */
9535 static bool
9536 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9538 const struct processor *tmp_arch = NULL;
9539 enum aarch64_parse_opt_result parse_res
9540 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9542 if (parse_res == AARCH64_PARSE_OK)
9544 gcc_assert (tmp_arch);
9545 selected_arch = tmp_arch;
9546 explicit_arch = selected_arch->arch;
9547 return true;
9550 switch (parse_res)
9552 case AARCH64_PARSE_MISSING_ARG:
9553 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9554 break;
9555 case AARCH64_PARSE_INVALID_ARG:
9556 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9557 aarch64_print_hint_for_arch (str);
9558 break;
9559 case AARCH64_PARSE_INVALID_FEATURE:
9560 error ("invalid feature modifier %qs for 'arch' target %s",
9561 str, pragma_or_attr);
9562 break;
9563 default:
9564 gcc_unreachable ();
9567 return false;
9570 /* Handle the argument CPU_STR to the cpu= target attribute.
9571 PRAGMA_OR_ATTR is used in potential error messages. */
9573 static bool
9574 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9576 const struct processor *tmp_cpu = NULL;
9577 enum aarch64_parse_opt_result parse_res
9578 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9580 if (parse_res == AARCH64_PARSE_OK)
9582 gcc_assert (tmp_cpu);
9583 selected_tune = tmp_cpu;
9584 explicit_tune_core = selected_tune->ident;
9586 selected_arch = &all_architectures[tmp_cpu->arch];
9587 explicit_arch = selected_arch->arch;
9588 return true;
9591 switch (parse_res)
9593 case AARCH64_PARSE_MISSING_ARG:
9594 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9595 break;
9596 case AARCH64_PARSE_INVALID_ARG:
9597 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9598 aarch64_print_hint_for_core (str);
9599 break;
9600 case AARCH64_PARSE_INVALID_FEATURE:
9601 error ("invalid feature modifier %qs for 'cpu' target %s",
9602 str, pragma_or_attr);
9603 break;
9604 default:
9605 gcc_unreachable ();
9608 return false;
9611 /* Handle the argument STR to the tune= target attribute.
9612 PRAGMA_OR_ATTR is used in potential error messages. */
9614 static bool
9615 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9617 const struct processor *tmp_tune = NULL;
9618 enum aarch64_parse_opt_result parse_res
9619 = aarch64_parse_tune (str, &tmp_tune);
9621 if (parse_res == AARCH64_PARSE_OK)
9623 gcc_assert (tmp_tune);
9624 selected_tune = tmp_tune;
9625 explicit_tune_core = selected_tune->ident;
9626 return true;
9629 switch (parse_res)
9631 case AARCH64_PARSE_INVALID_ARG:
9632 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9633 aarch64_print_hint_for_core (str);
9634 break;
9635 default:
9636 gcc_unreachable ();
9639 return false;
9642 /* Parse an architecture extensions target attribute string specified in STR.
9643 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9644 if successful. Update aarch64_isa_flags to reflect the ISA features
9645 modified.
9646 PRAGMA_OR_ATTR is used in potential error messages. */
9648 static bool
9649 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9651 enum aarch64_parse_opt_result parse_res;
9652 unsigned long isa_flags = aarch64_isa_flags;
9654 /* We allow "+nothing" at the beginning to clear out all architectural
9655 features if the user wants to handpick specific features. */
9656 if (strncmp ("+nothing", str, 8) == 0)
9658 isa_flags = 0;
9659 str += 8;
9662 parse_res = aarch64_parse_extension (str, &isa_flags);
9664 if (parse_res == AARCH64_PARSE_OK)
9666 aarch64_isa_flags = isa_flags;
9667 return true;
9670 switch (parse_res)
9672 case AARCH64_PARSE_MISSING_ARG:
9673 error ("missing feature modifier in target %s %qs",
9674 pragma_or_attr, str);
9675 break;
9677 case AARCH64_PARSE_INVALID_FEATURE:
9678 error ("invalid feature modifier in target %s %qs",
9679 pragma_or_attr, str);
9680 break;
9682 default:
9683 gcc_unreachable ();
9686 return false;
9689 /* The target attributes that we support. On top of these we also support just
9690 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9691 handled explicitly in aarch64_process_one_target_attr. */
9693 static const struct aarch64_attribute_info aarch64_attributes[] =
9695 { "general-regs-only", aarch64_attr_mask, false, NULL,
9696 OPT_mgeneral_regs_only },
9697 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9698 OPT_mfix_cortex_a53_835769 },
9699 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9700 OPT_mfix_cortex_a53_843419 },
9701 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9702 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9703 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9704 OPT_momit_leaf_frame_pointer },
9705 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9706 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9707 OPT_march_ },
9708 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9709 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9710 OPT_mtune_ },
9711 { "sign-return-address", aarch64_attr_enum, false, NULL,
9712 OPT_msign_return_address_ },
9713 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9716 /* Parse ARG_STR which contains the definition of one target attribute.
9717 Show appropriate errors if any or return true if the attribute is valid.
9718 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9719 we're processing a target attribute or pragma. */
9721 static bool
9722 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9724 bool invert = false;
9726 size_t len = strlen (arg_str);
9728 if (len == 0)
9730 error ("malformed target %s", pragma_or_attr);
9731 return false;
9734 char *str_to_check = (char *) alloca (len + 1);
9735 strcpy (str_to_check, arg_str);
9737 /* Skip leading whitespace. */
9738 while (*str_to_check == ' ' || *str_to_check == '\t')
9739 str_to_check++;
9741 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9742 It is easier to detect and handle it explicitly here rather than going
9743 through the machinery for the rest of the target attributes in this
9744 function. */
9745 if (*str_to_check == '+')
9746 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9748 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9750 invert = true;
9751 str_to_check += 3;
9753 char *arg = strchr (str_to_check, '=');
9755 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9756 and point ARG to "foo". */
9757 if (arg)
9759 *arg = '\0';
9760 arg++;
9762 const struct aarch64_attribute_info *p_attr;
9763 bool found = false;
9764 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9766 /* If the names don't match up, or the user has given an argument
9767 to an attribute that doesn't accept one, or didn't give an argument
9768 to an attribute that expects one, fail to match. */
9769 if (strcmp (str_to_check, p_attr->name) != 0)
9770 continue;
9772 found = true;
9773 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9774 || p_attr->attr_type == aarch64_attr_enum;
9776 if (attr_need_arg_p ^ (arg != NULL))
9778 error ("target %s %qs does not accept an argument",
9779 pragma_or_attr, str_to_check);
9780 return false;
9783 /* If the name matches but the attribute does not allow "no-" versions
9784 then we can't match. */
9785 if (invert && !p_attr->allow_neg)
9787 error ("target %s %qs does not allow a negated form",
9788 pragma_or_attr, str_to_check);
9789 return false;
9792 switch (p_attr->attr_type)
9794 /* Has a custom handler registered.
9795 For example, cpu=, arch=, tune=. */
9796 case aarch64_attr_custom:
9797 gcc_assert (p_attr->handler);
9798 if (!p_attr->handler (arg, pragma_or_attr))
9799 return false;
9800 break;
9802 /* Either set or unset a boolean option. */
9803 case aarch64_attr_bool:
9805 struct cl_decoded_option decoded;
9807 generate_option (p_attr->opt_num, NULL, !invert,
9808 CL_TARGET, &decoded);
9809 aarch64_handle_option (&global_options, &global_options_set,
9810 &decoded, input_location);
9811 break;
9813 /* Set or unset a bit in the target_flags. aarch64_handle_option
9814 should know what mask to apply given the option number. */
9815 case aarch64_attr_mask:
9817 struct cl_decoded_option decoded;
9818 /* We only need to specify the option number.
9819 aarch64_handle_option will know which mask to apply. */
9820 decoded.opt_index = p_attr->opt_num;
9821 decoded.value = !invert;
9822 aarch64_handle_option (&global_options, &global_options_set,
9823 &decoded, input_location);
9824 break;
9826 /* Use the option setting machinery to set an option to an enum. */
9827 case aarch64_attr_enum:
9829 gcc_assert (arg);
9830 bool valid;
9831 int value;
9832 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9833 &value, CL_TARGET);
9834 if (valid)
9836 set_option (&global_options, NULL, p_attr->opt_num, value,
9837 NULL, DK_UNSPECIFIED, input_location,
9838 global_dc);
9840 else
9842 error ("target %s %s=%s is not valid",
9843 pragma_or_attr, str_to_check, arg);
9845 break;
9847 default:
9848 gcc_unreachable ();
9852 /* If we reached here we either have found an attribute and validated
9853 it or didn't match any. If we matched an attribute but its arguments
9854 were malformed we will have returned false already. */
9855 return found;
9858 /* Count how many times the character C appears in
9859 NULL-terminated string STR. */
9861 static unsigned int
9862 num_occurences_in_str (char c, char *str)
9864 unsigned int res = 0;
9865 while (*str != '\0')
9867 if (*str == c)
9868 res++;
9870 str++;
9873 return res;
9876 /* Parse the tree in ARGS that contains the target attribute information
9877 and update the global target options space. PRAGMA_OR_ATTR is a string
9878 to be used in error messages, specifying whether this is processing
9879 a target attribute or a target pragma. */
9881 bool
9882 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9884 if (TREE_CODE (args) == TREE_LIST)
9888 tree head = TREE_VALUE (args);
9889 if (head)
9891 if (!aarch64_process_target_attr (head, pragma_or_attr))
9892 return false;
9894 args = TREE_CHAIN (args);
9895 } while (args);
9897 return true;
9900 if (TREE_CODE (args) != STRING_CST)
9902 error ("attribute %<target%> argument not a string");
9903 return false;
9906 size_t len = strlen (TREE_STRING_POINTER (args));
9907 char *str_to_check = (char *) alloca (len + 1);
9908 strcpy (str_to_check, TREE_STRING_POINTER (args));
9910 if (len == 0)
9912 error ("malformed target %s value", pragma_or_attr);
9913 return false;
9916 /* Used to catch empty strings between commas, i.e.
9917 attribute ((target ("attr1,,attr2"))). */
9918 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9920 /* Handle multiple target attributes separated by ','. */
9921 char *token = strtok (str_to_check, ",");
9923 unsigned int num_attrs = 0;
9924 while (token)
9926 num_attrs++;
9927 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9929 error ("target %s %qs is invalid", pragma_or_attr, token);
9930 return false;
9933 token = strtok (NULL, ",");
9936 if (num_attrs != num_commas + 1)
9938 error ("malformed target %s list %qs",
9939 pragma_or_attr, TREE_STRING_POINTER (args));
9940 return false;
9943 return true;
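/* As an example, __attribute__ ((target ("tune=cortex-a53,+crc"))) is
   split on ',' into two attributes: "tune=cortex-a53" is handled by the
   custom handler aarch64_handle_attr_tune, while "+crc" is treated as a
   bare list of ISA feature modifiers and goes through
   aarch64_handle_attr_isa_flags.  */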
9946 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9947 process attribute ((target ("..."))). */
9949 static bool
9950 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9952 struct cl_target_option cur_target;
9953 bool ret;
9954 tree old_optimize;
9955 tree new_target, new_optimize;
9956 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9958 /* If what we're processing is the current pragma string then the
9959 target option node is already stored in target_option_current_node
9960 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9961 having to re-parse the string. This is especially useful to keep
9962 arm_neon.h compile times down since that header contains a lot
9963 of intrinsics enclosed in pragmas. */
9964 if (!existing_target && args == current_target_pragma)
9966 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9967 return true;
9969 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9971 old_optimize = build_optimization_node (&global_options);
9972 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9974 /* If the function changed the optimization levels as well as setting
9975 target options, start with the optimizations specified. */
9976 if (func_optimize && func_optimize != old_optimize)
9977 cl_optimization_restore (&global_options,
9978 TREE_OPTIMIZATION (func_optimize));
9980 /* Save the current target options to restore at the end. */
9981 cl_target_option_save (&cur_target, &global_options);
9983 /* If fndecl already has some target attributes applied to it, unpack
9984 them so that we add this attribute on top of them, rather than
9985 overwriting them. */
9986 if (existing_target)
9988 struct cl_target_option *existing_options
9989 = TREE_TARGET_OPTION (existing_target);
9991 if (existing_options)
9992 cl_target_option_restore (&global_options, existing_options);
9994 else
9995 cl_target_option_restore (&global_options,
9996 TREE_TARGET_OPTION (target_option_current_node));
9999 ret = aarch64_process_target_attr (args, "attribute");
10001 /* Set up any additional state. */
10002 if (ret)
10004 aarch64_override_options_internal (&global_options);
10005 /* Initialize SIMD builtins if we haven't already.
10006 Set current_target_pragma to NULL for the duration so that
10007 the builtin initialization code doesn't try to tag the functions
10008 being built with the attributes specified by any current pragma, thus
10009 going into an infinite recursion. */
10010 if (TARGET_SIMD)
10012 tree saved_current_target_pragma = current_target_pragma;
10013 current_target_pragma = NULL;
10014 aarch64_init_simd_builtins ();
10015 current_target_pragma = saved_current_target_pragma;
10017 new_target = build_target_option_node (&global_options);
10019 else
10020 new_target = NULL;
10022 new_optimize = build_optimization_node (&global_options);
10024 if (fndecl && ret)
10026 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10028 if (old_optimize != new_optimize)
10029 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10032 cl_target_option_restore (&global_options, &cur_target);
10034 if (old_optimize != new_optimize)
10035 cl_optimization_restore (&global_options,
10036 TREE_OPTIMIZATION (old_optimize));
10037 return ret;
10040 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10041 tri-bool options (yes, no, don't care) and the default value is
10042 DEF, determine whether to reject inlining. */
10044 static bool
10045 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10046 int dont_care, int def)
10048 /* If the callee doesn't care, always allow inlining. */
10049 if (callee == dont_care)
10050 return true;
10052 /* If the caller doesn't care, always allow inlining. */
10053 if (caller == dont_care)
10054 return true;
10056 /* Otherwise, allow inlining if the callee and caller values
10057 agree, or if the callee is using the default value. */
10058 return (callee == caller || callee == def);
10061 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10062 to inline CALLEE into CALLER based on target-specific info.
10063 Make sure that the caller and callee have compatible architectural
10064 features. Then go through the other possible target attributes
10065 and see if they can block inlining. Try not to reject always_inline
10066 callees unless they are incompatible architecturally. */
10068 static bool
10069 aarch64_can_inline_p (tree caller, tree callee)
10071 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10072 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10074 /* If callee has no option attributes, then it is ok to inline. */
10075 if (!callee_tree)
10076 return true;
10078 struct cl_target_option *caller_opts
10079 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10080 : target_option_default_node);
10082 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10085 /* Callee's ISA flags should be a subset of the caller's. */
10086 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10087 != callee_opts->x_aarch64_isa_flags)
10088 return false;
10090 /* Allow non-strict-aligned functions to be inlined into
10091 strict-aligned ones. */
10092 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10093 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10094 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10095 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10096 return false;
10098 bool always_inline = lookup_attribute ("always_inline",
10099 DECL_ATTRIBUTES (callee));
10101 /* If the architectural features match up and the callee is always_inline
10102 then the other attributes don't matter. */
10103 if (always_inline)
10104 return true;
10106 if (caller_opts->x_aarch64_cmodel_var
10107 != callee_opts->x_aarch64_cmodel_var)
10108 return false;
10110 if (caller_opts->x_aarch64_tls_dialect
10111 != callee_opts->x_aarch64_tls_dialect)
10112 return false;
10114 /* Honour explicit requests to workaround errata. */
10115 if (!aarch64_tribools_ok_for_inlining_p (
10116 caller_opts->x_aarch64_fix_a53_err835769,
10117 callee_opts->x_aarch64_fix_a53_err835769,
10118 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10119 return false;
10121 if (!aarch64_tribools_ok_for_inlining_p (
10122 caller_opts->x_aarch64_fix_a53_err843419,
10123 callee_opts->x_aarch64_fix_a53_err843419,
10124 2, TARGET_FIX_ERR_A53_843419))
10125 return false;
10127 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10128 caller and callee and they don't match up, reject inlining. */
10129 if (!aarch64_tribools_ok_for_inlining_p (
10130 caller_opts->x_flag_omit_leaf_frame_pointer,
10131 callee_opts->x_flag_omit_leaf_frame_pointer,
10132 2, 1))
10133 return false;
10135 /* If the callee has specific tuning overrides, respect them. */
10136 if (callee_opts->x_aarch64_override_tune_string != NULL
10137 && caller_opts->x_aarch64_override_tune_string == NULL)
10138 return false;
10140 /* If the user specified tuning override strings for the
10141 caller and callee and they don't match up, reject inlining.
10142 We just do a string compare here, we don't analyze the meaning
10143 of the string, as it would be too costly for little gain. */
10144 if (callee_opts->x_aarch64_override_tune_string
10145 && caller_opts->x_aarch64_override_tune_string
10146 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10147 caller_opts->x_aarch64_override_tune_string) != 0))
10148 return false;
10150 return true;
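/* An illustrative example (not part of the sources above) of the ISA
   subset check:

     __attribute__ ((target ("+crypto")))
     static inline int callee (int x) { return x; }

     int caller (int x) { return callee (x); }   // compiled without +crypto

   The callee's ISA flags are not a subset of the caller's, so inlining
   is rejected even for an always_inline callee, because the
   architectural checks run before the always_inline shortcut.  */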
10153 /* Return true if SYMBOL_REF X binds locally. */
10155 static bool
10156 aarch64_symbol_binds_local_p (const_rtx x)
10158 return (SYMBOL_REF_DECL (x)
10159 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10160 : SYMBOL_REF_LOCAL_P (x));
10163 /* Return true if SYMBOL_REF X is thread-local. */
10164 static bool
10165 aarch64_tls_symbol_p (rtx x)
10167 if (! TARGET_HAVE_TLS)
10168 return false;
10170 if (GET_CODE (x) != SYMBOL_REF)
10171 return false;
10173 return SYMBOL_REF_TLS_MODEL (x) != 0;
10176 /* Classify a TLS symbol into one of the TLS kinds. */
10177 enum aarch64_symbol_type
10178 aarch64_classify_tls_symbol (rtx x)
10180 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10182 switch (tls_kind)
10184 case TLS_MODEL_GLOBAL_DYNAMIC:
10185 case TLS_MODEL_LOCAL_DYNAMIC:
10186 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10188 case TLS_MODEL_INITIAL_EXEC:
10189 switch (aarch64_cmodel)
10191 case AARCH64_CMODEL_TINY:
10192 case AARCH64_CMODEL_TINY_PIC:
10193 return SYMBOL_TINY_TLSIE;
10194 default:
10195 return SYMBOL_SMALL_TLSIE;
10198 case TLS_MODEL_LOCAL_EXEC:
10199 if (aarch64_tls_size == 12)
10200 return SYMBOL_TLSLE12;
10201 else if (aarch64_tls_size == 24)
10202 return SYMBOL_TLSLE24;
10203 else if (aarch64_tls_size == 32)
10204 return SYMBOL_TLSLE32;
10205 else if (aarch64_tls_size == 48)
10206 return SYMBOL_TLSLE48;
10207 else
10208 gcc_unreachable ();
10210 case TLS_MODEL_EMULATED:
10211 case TLS_MODEL_NONE:
10212 return SYMBOL_FORCE_TO_MEM;
10214 default:
10215 gcc_unreachable ();
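/* Illustrative mapping, assuming the usual GNU/Linux defaults
   (-mtls-dialect=desc, -mtls-size=24):

     static __thread int counter;  // local-exec in an executable
                                   //   -> SYMBOL_TLSLE24
     extern __thread int shared;   // global-dynamic in a shared object
                                   //   -> SYMBOL_SMALL_TLSDESC

   The model actually chosen also depends on -ftls-model and on whether
   the code is position independent.  */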
10219 /* Return the method that should be used to access SYMBOL_REF or
10220 LABEL_REF X. */
10222 enum aarch64_symbol_type
10223 aarch64_classify_symbol (rtx x, rtx offset)
10225 if (GET_CODE (x) == LABEL_REF)
10227 switch (aarch64_cmodel)
10229 case AARCH64_CMODEL_LARGE:
10230 return SYMBOL_FORCE_TO_MEM;
10232 case AARCH64_CMODEL_TINY_PIC:
10233 case AARCH64_CMODEL_TINY:
10234 return SYMBOL_TINY_ABSOLUTE;
10236 case AARCH64_CMODEL_SMALL_SPIC:
10237 case AARCH64_CMODEL_SMALL_PIC:
10238 case AARCH64_CMODEL_SMALL:
10239 return SYMBOL_SMALL_ABSOLUTE;
10241 default:
10242 gcc_unreachable ();
10246 if (GET_CODE (x) == SYMBOL_REF)
10248 if (aarch64_tls_symbol_p (x))
10249 return aarch64_classify_tls_symbol (x);
10251 switch (aarch64_cmodel)
10253 case AARCH64_CMODEL_TINY:
10254 /* When we retrieve symbol + offset address, we have to make sure
10255 the offset does not cause overflow of the final address. But
10256 we have no way of knowing the address of symbol at compile time
10257 so we can't accurately say if the distance between the PC and
10258 symbol + offset is outside the addressable range of +/-1M in the
10259 TINY code model. So we rely on images not being greater than
10260 1M, cap the offset at 1M, and require anything beyond 1M to be
10261 loaded using an alternative mechanism. Furthermore, if the
10262 symbol is a weak reference to something that isn't known to
10263 resolve to a symbol in this module, then force to memory. */
10264 if ((SYMBOL_REF_WEAK (x)
10265 && !aarch64_symbol_binds_local_p (x))
10266 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10267 return SYMBOL_FORCE_TO_MEM;
10268 return SYMBOL_TINY_ABSOLUTE;
10270 case AARCH64_CMODEL_SMALL:
10271 /* Same reasoning as the tiny code model, but the offset cap here is
10272 4G. */
10273 if ((SYMBOL_REF_WEAK (x)
10274 && !aarch64_symbol_binds_local_p (x))
10275 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10276 HOST_WIDE_INT_C (4294967264)))
10277 return SYMBOL_FORCE_TO_MEM;
10278 return SYMBOL_SMALL_ABSOLUTE;
10280 case AARCH64_CMODEL_TINY_PIC:
10281 if (!aarch64_symbol_binds_local_p (x))
10282 return SYMBOL_TINY_GOT;
10283 return SYMBOL_TINY_ABSOLUTE;
10285 case AARCH64_CMODEL_SMALL_SPIC:
10286 case AARCH64_CMODEL_SMALL_PIC:
10287 if (!aarch64_symbol_binds_local_p (x))
10288 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10289 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10290 return SYMBOL_SMALL_ABSOLUTE;
10292 case AARCH64_CMODEL_LARGE:
10293 /* This is alright even in PIC code as the constant
10294 pool reference is always PC relative and within
10295 the same translation unit. */
10296 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10297 return SYMBOL_SMALL_ABSOLUTE;
10298 else
10299 return SYMBOL_FORCE_TO_MEM;
10301 default:
10302 gcc_unreachable ();
10306 /* By default push everything into the constant pool. */
10307 return SYMBOL_FORCE_TO_MEM;
10310 bool
10311 aarch64_constant_address_p (rtx x)
10313 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10316 bool
10317 aarch64_legitimate_pic_operand_p (rtx x)
10319 if (GET_CODE (x) == SYMBOL_REF
10320 || (GET_CODE (x) == CONST
10321 && GET_CODE (XEXP (x, 0)) == PLUS
10322 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10323 return false;
10325 return true;
10328 /* Return true if X holds either a quarter-precision or
10329 floating-point +0.0 constant. */
10330 static bool
10331 aarch64_valid_floating_const (rtx x)
10333 if (!CONST_DOUBLE_P (x))
10334 return false;
10336 /* This call determines which constants can be used in mov<mode>
10337 as integer moves instead of constant loads. */
10338 if (aarch64_float_const_rtx_p (x))
10339 return true;
10341 return aarch64_float_const_representable_p (x);
10344 static bool
10345 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10347 /* Do not allow vector struct mode constants. We could support
10348 0 and -1 easily, but they need support in aarch64-simd.md. */
10349 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10350 return false;
10352 /* For these cases we never want to use a literal load.
10353 As such we have to prevent the compiler from forcing these
10354 to memory. */
10355 if ((GET_CODE (x) == CONST_VECTOR
10356 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10357 || CONST_INT_P (x)
10358 || aarch64_valid_floating_const (x)
10359 || aarch64_can_const_movi_rtx_p (x, mode)
10360 || aarch64_float_const_rtx_p (x))
10361 return !targetm.cannot_force_const_mem (mode, x);
10363 if (GET_CODE (x) == HIGH
10364 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10365 return true;
10367 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10368 so spilling them is better than rematerialization. */
10369 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10370 return true;
10372 return aarch64_constant_address_p (x);
10376 aarch64_load_tp (rtx target)
10378 if (!target
10379 || GET_MODE (target) != Pmode
10380 || !register_operand (target, Pmode))
10381 target = gen_reg_rtx (Pmode);
10383 /* Can return in any reg. */
10384 emit_insn (gen_aarch64_load_tp_hard (target));
10385 return target;
10388 /* On AAPCS systems, this is the "struct __va_list". */
10389 static GTY(()) tree va_list_type;
10391 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10392 Return the type to use as __builtin_va_list.
10394 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10396 struct __va_list
10398 void *__stack;
10399 void *__gr_top;
10400 void *__vr_top;
10401 int __gr_offs;
10402 int __vr_offs;
10403 }; */
10405 static tree
10406 aarch64_build_builtin_va_list (void)
10408 tree va_list_name;
10409 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10411 /* Create the type. */
10412 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10413 /* Give it the required name. */
10414 va_list_name = build_decl (BUILTINS_LOCATION,
10415 TYPE_DECL,
10416 get_identifier ("__va_list"),
10417 va_list_type);
10418 DECL_ARTIFICIAL (va_list_name) = 1;
10419 TYPE_NAME (va_list_type) = va_list_name;
10420 TYPE_STUB_DECL (va_list_type) = va_list_name;
10422 /* Create the fields. */
10423 f_stack = build_decl (BUILTINS_LOCATION,
10424 FIELD_DECL, get_identifier ("__stack"),
10425 ptr_type_node);
10426 f_grtop = build_decl (BUILTINS_LOCATION,
10427 FIELD_DECL, get_identifier ("__gr_top"),
10428 ptr_type_node);
10429 f_vrtop = build_decl (BUILTINS_LOCATION,
10430 FIELD_DECL, get_identifier ("__vr_top"),
10431 ptr_type_node);
10432 f_groff = build_decl (BUILTINS_LOCATION,
10433 FIELD_DECL, get_identifier ("__gr_offs"),
10434 integer_type_node);
10435 f_vroff = build_decl (BUILTINS_LOCATION,
10436 FIELD_DECL, get_identifier ("__vr_offs"),
10437 integer_type_node);
10439 /* Tell tree-stdarg pass about our internal offset fields.
10440 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10441 purposes, to identify whether the code is updating the va_list internal
10442 offset fields in an irregular way. */
10443 va_list_gpr_counter_field = f_groff;
10444 va_list_fpr_counter_field = f_vroff;
10446 DECL_ARTIFICIAL (f_stack) = 1;
10447 DECL_ARTIFICIAL (f_grtop) = 1;
10448 DECL_ARTIFICIAL (f_vrtop) = 1;
10449 DECL_ARTIFICIAL (f_groff) = 1;
10450 DECL_ARTIFICIAL (f_vroff) = 1;
10452 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10453 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10454 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10455 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10456 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10458 TYPE_FIELDS (va_list_type) = f_stack;
10459 DECL_CHAIN (f_stack) = f_grtop;
10460 DECL_CHAIN (f_grtop) = f_vrtop;
10461 DECL_CHAIN (f_vrtop) = f_groff;
10462 DECL_CHAIN (f_groff) = f_vroff;
10464 /* Compute its layout. */
10465 layout_type (va_list_type);
10467 return va_list_type;
10470 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10471 static void
10472 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10474 const CUMULATIVE_ARGS *cum;
10475 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10476 tree stack, grtop, vrtop, groff, vroff;
10477 tree t;
10478 int gr_save_area_size = cfun->va_list_gpr_size;
10479 int vr_save_area_size = cfun->va_list_fpr_size;
10480 int vr_offset;
10482 cum = &crtl->args.info;
10483 if (cfun->va_list_gpr_size)
10484 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10485 cfun->va_list_gpr_size);
10486 if (cfun->va_list_fpr_size)
10487 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10488 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10490 if (!TARGET_FLOAT)
10492 gcc_assert (cum->aapcs_nvrn == 0);
10493 vr_save_area_size = 0;
10496 f_stack = TYPE_FIELDS (va_list_type_node);
10497 f_grtop = DECL_CHAIN (f_stack);
10498 f_vrtop = DECL_CHAIN (f_grtop);
10499 f_groff = DECL_CHAIN (f_vrtop);
10500 f_vroff = DECL_CHAIN (f_groff);
10502 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10503 NULL_TREE);
10504 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10505 NULL_TREE);
10506 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10507 NULL_TREE);
10508 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10509 NULL_TREE);
10510 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10511 NULL_TREE);
10513 /* Emit code to initialize STACK, which points to the next varargs stack
10514 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10515 by named arguments. STACK is 8-byte aligned. */
10516 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10517 if (cum->aapcs_stack_size > 0)
10518 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10519 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10520 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10522 /* Emit code to initialize GRTOP, the top of the GR save area.
10523 virtual_incoming_args_rtx should have been 16 byte aligned. */
10524 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10525 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10526 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10528 /* Emit code to initialize VRTOP, the top of the VR save area.
10529 This address is gr_save_area_bytes below GRTOP, rounded
10530 down to the next 16-byte boundary. */
10531 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10532 vr_offset = ROUND_UP (gr_save_area_size,
10533 STACK_BOUNDARY / BITS_PER_UNIT);
10535 if (vr_offset)
10536 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10537 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10538 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10540 /* Emit code to initialize GROFF, the offset from GRTOP of the
10541 next GPR argument. */
10542 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10543 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10544 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10546 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10547 of the next VR argument. */
10548 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10549 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10550 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
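/* A rough C-level sketch of what the expansion above produces, using
   the __va_list fields (illustrative only; incoming_args stands for
   virtual_incoming_args_rtx and named_stack_bytes for
   cum->aapcs_stack_size * UNITS_PER_WORD):

     ap.__stack   = incoming_args + named_stack_bytes;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = ap.__gr_top - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */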
10553 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10555 static tree
10556 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10557 gimple_seq *post_p ATTRIBUTE_UNUSED)
10559 tree addr;
10560 bool indirect_p;
10561 bool is_ha; /* is HFA or HVA. */
10562 bool dw_align; /* double-word align. */
10563 machine_mode ag_mode = VOIDmode;
10564 int nregs;
10565 machine_mode mode;
10567 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10568 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10569 HOST_WIDE_INT size, rsize, adjust, align;
10570 tree t, u, cond1, cond2;
10572 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10573 if (indirect_p)
10574 type = build_pointer_type (type);
10576 mode = TYPE_MODE (type);
10578 f_stack = TYPE_FIELDS (va_list_type_node);
10579 f_grtop = DECL_CHAIN (f_stack);
10580 f_vrtop = DECL_CHAIN (f_grtop);
10581 f_groff = DECL_CHAIN (f_vrtop);
10582 f_vroff = DECL_CHAIN (f_groff);
10584 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10585 f_stack, NULL_TREE);
10586 size = int_size_in_bytes (type);
10587 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10589 dw_align = false;
10590 adjust = 0;
10591 if (aarch64_vfp_is_call_or_return_candidate (mode,
10592 type,
10593 &ag_mode,
10594 &nregs,
10595 &is_ha))
10597 /* TYPE passed in fp/simd registers. */
10598 if (!TARGET_FLOAT)
10599 aarch64_err_no_fpadvsimd (mode, "varargs");
10601 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10602 unshare_expr (valist), f_vrtop, NULL_TREE);
10603 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10604 unshare_expr (valist), f_vroff, NULL_TREE);
10606 rsize = nregs * UNITS_PER_VREG;
10608 if (is_ha)
10610 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10611 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10613 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10614 && size < UNITS_PER_VREG)
10616 adjust = UNITS_PER_VREG - size;
10619 else
10621 /* TYPE passed in general registers. */
10622 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10623 unshare_expr (valist), f_grtop, NULL_TREE);
10624 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10625 unshare_expr (valist), f_groff, NULL_TREE);
10626 rsize = ROUND_UP (size, UNITS_PER_WORD);
10627 nregs = rsize / UNITS_PER_WORD;
10629 if (align > 8)
10630 dw_align = true;
10632 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10633 && size < UNITS_PER_WORD)
10635 adjust = UNITS_PER_WORD - size;
10639 /* Get a local temporary for the field value. */
10640 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10642 /* Emit code to branch if off >= 0. */
10643 t = build2 (GE_EXPR, boolean_type_node, off,
10644 build_int_cst (TREE_TYPE (off), 0));
10645 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10647 if (dw_align)
10649 /* Emit: offs = (offs + 15) & -16. */
10650 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10651 build_int_cst (TREE_TYPE (off), 15));
10652 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10653 build_int_cst (TREE_TYPE (off), -16));
10654 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10656 else
10657 roundup = NULL;
10659 /* Update ap.__[g|v]r_offs */
10660 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10661 build_int_cst (TREE_TYPE (off), rsize));
10662 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10664 /* String up. */
10665 if (roundup)
10666 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10668 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10669 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10670 build_int_cst (TREE_TYPE (f_off), 0));
10671 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10673 /* String up: make sure the assignment happens before the use. */
10674 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10675 COND_EXPR_ELSE (cond1) = t;
10677 /* Prepare the trees handling the argument that is passed on the stack;
10678 the top-level node will be stored in ON_STACK. */
10679 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10680 if (align > 8)
10682 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10683 t = fold_convert (intDI_type_node, arg);
10684 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10685 build_int_cst (TREE_TYPE (t), 15));
10686 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10687 build_int_cst (TREE_TYPE (t), -16));
10688 t = fold_convert (TREE_TYPE (arg), t);
10689 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10691 else
10692 roundup = NULL;
10693 /* Advance ap.__stack */
10694 t = fold_convert (intDI_type_node, arg);
10695 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10696 build_int_cst (TREE_TYPE (t), size + 7));
10697 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10698 build_int_cst (TREE_TYPE (t), -8));
10699 t = fold_convert (TREE_TYPE (arg), t);
10700 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10701 /* String up roundup and advance. */
10702 if (roundup)
10703 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10704 /* String up with arg */
10705 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10706 /* Big-endianness related address adjustment. */
10707 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10708 && size < UNITS_PER_WORD)
10710 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10711 size_int (UNITS_PER_WORD - size));
10712 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10715 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10716 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10718 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10719 t = off;
10720 if (adjust)
10721 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10722 build_int_cst (TREE_TYPE (off), adjust));
10724 t = fold_convert (sizetype, t);
10725 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10727 if (is_ha)
10729 /* type ha; // treat as "struct {ftype field[n];}"
10730 ... [computing offs]
10731 for (i = 0; i <nregs; ++i, offs += 16)
10732 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10733 return ha; */
10734 int i;
10735 tree tmp_ha, field_t, field_ptr_t;
10737 /* Declare a local variable. */
10738 tmp_ha = create_tmp_var_raw (type, "ha");
10739 gimple_add_tmp_var (tmp_ha);
10741 /* Establish the base type. */
10742 switch (ag_mode)
10744 case SFmode:
10745 field_t = float_type_node;
10746 field_ptr_t = float_ptr_type_node;
10747 break;
10748 case DFmode:
10749 field_t = double_type_node;
10750 field_ptr_t = double_ptr_type_node;
10751 break;
10752 case TFmode:
10753 field_t = long_double_type_node;
10754 field_ptr_t = long_double_ptr_type_node;
10755 break;
10756 case HFmode:
10757 field_t = aarch64_fp16_type_node;
10758 field_ptr_t = aarch64_fp16_ptr_type_node;
10759 break;
10760 case V2SImode:
10761 case V4SImode:
10763 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10764 field_t = build_vector_type_for_mode (innertype, ag_mode);
10765 field_ptr_t = build_pointer_type (field_t);
10767 break;
10768 default:
10769 gcc_assert (0);
10772 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10773 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10774 addr = t;
10775 t = fold_convert (field_ptr_t, addr);
10776 t = build2 (MODIFY_EXPR, field_t,
10777 build1 (INDIRECT_REF, field_t, tmp_ha),
10778 build1 (INDIRECT_REF, field_t, t));
10780 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10781 for (i = 1; i < nregs; ++i)
10783 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10784 u = fold_convert (field_ptr_t, addr);
10785 u = build2 (MODIFY_EXPR, field_t,
10786 build2 (MEM_REF, field_t, tmp_ha,
10787 build_int_cst (field_ptr_t,
10788 (i *
10789 int_size_in_bytes (field_t)))),
10790 build1 (INDIRECT_REF, field_t, u));
10791 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10794 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10795 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10798 COND_EXPR_ELSE (cond2) = t;
10799 addr = fold_convert (build_pointer_type (type), cond1);
10800 addr = build_va_arg_indirect_ref (addr);
10802 if (indirect_p)
10803 addr = build_va_arg_indirect_ref (addr);
10805 return addr;
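/* A simplified sketch of the va_arg sequence built above for a value
   of SIZE bytes passed in general registers, ignoring the alignment
   and big-endian adjustments (illustrative only):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                    // register save area exhausted
     ap.__gr_offs = off + ROUND_UP (SIZE, 8);
     if (ap.__gr_offs > 0)
       goto on_stack;                    // argument spilled to the stack
     addr = (char *) ap.__gr_top + off;  // read from the save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (char *) ap.__stack + ROUND_UP (SIZE, 8);
   done:
     result = *(TYPE *) addr;

   The FP/SIMD path has the same shape but uses __vr_top/__vr_offs and
   16-byte register slots.  */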
10808 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10810 static void
10811 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10812 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10813 int no_rtl)
10815 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10816 CUMULATIVE_ARGS local_cum;
10817 int gr_saved = cfun->va_list_gpr_size;
10818 int vr_saved = cfun->va_list_fpr_size;
10820 /* The caller has advanced CUM up to, but not beyond, the last named
10821 argument. Advance a local copy of CUM past the last "real" named
10822 argument, to find out how many registers are left over. */
10823 local_cum = *cum;
10824 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10826 /* Find out how many registers we need to save.
10827 Honor the tree-stdarg analysis results. */
10828 if (cfun->va_list_gpr_size)
10829 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10830 cfun->va_list_gpr_size / UNITS_PER_WORD);
10831 if (cfun->va_list_fpr_size)
10832 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10833 cfun->va_list_fpr_size / UNITS_PER_VREG);
10835 if (!TARGET_FLOAT)
10837 gcc_assert (local_cum.aapcs_nvrn == 0);
10838 vr_saved = 0;
10841 if (!no_rtl)
10843 if (gr_saved > 0)
10845 rtx ptr, mem;
10847 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10848 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10849 - gr_saved * UNITS_PER_WORD);
10850 mem = gen_frame_mem (BLKmode, ptr);
10851 set_mem_alias_set (mem, get_varargs_alias_set ());
10853 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10854 mem, gr_saved);
10856 if (vr_saved > 0)
10858 /* We can't use move_block_from_reg, because it will use
10859 the wrong mode, storing D regs only. */
10860 machine_mode mode = TImode;
10861 int off, i, vr_start;
10863 /* Set OFF to the offset from virtual_incoming_args_rtx of
10864 the first vector register. The VR save area lies below
10865 the GR one, and is aligned to 16 bytes. */
10866 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10867 STACK_BOUNDARY / BITS_PER_UNIT);
10868 off -= vr_saved * UNITS_PER_VREG;
10870 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10871 for (i = 0; i < vr_saved; ++i)
10873 rtx ptr, mem;
10875 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10876 mem = gen_frame_mem (mode, ptr);
10877 set_mem_alias_set (mem, get_varargs_alias_set ());
10878 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10879 off += UNITS_PER_VREG;
10884 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10885 any complication of having crtl->args.pretend_args_size changed. */
10886 cfun->machine->frame.saved_varargs_size
10887 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10888 STACK_BOUNDARY / BITS_PER_UNIT)
10889 + vr_saved * UNITS_PER_VREG);
10892 static void
10893 aarch64_conditional_register_usage (void)
10895 int i;
10896 if (!TARGET_FLOAT)
10898 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10900 fixed_regs[i] = 1;
10901 call_used_regs[i] = 1;
10906 /* Walk down the type tree of TYPE counting consecutive base elements.
10907 If *MODEP is VOIDmode, then set it to the first valid floating point
10908 type. If a non-floating point type is found, or if a floating point
10909 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10910 otherwise return the count in the sub-tree. */
10911 static int
10912 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10914 machine_mode mode;
10915 HOST_WIDE_INT size;
10917 switch (TREE_CODE (type))
10919 case REAL_TYPE:
10920 mode = TYPE_MODE (type);
10921 if (mode != DFmode && mode != SFmode
10922 && mode != TFmode && mode != HFmode)
10923 return -1;
10925 if (*modep == VOIDmode)
10926 *modep = mode;
10928 if (*modep == mode)
10929 return 1;
10931 break;
10933 case COMPLEX_TYPE:
10934 mode = TYPE_MODE (TREE_TYPE (type));
10935 if (mode != DFmode && mode != SFmode
10936 && mode != TFmode && mode != HFmode)
10937 return -1;
10939 if (*modep == VOIDmode)
10940 *modep = mode;
10942 if (*modep == mode)
10943 return 2;
10945 break;
10947 case VECTOR_TYPE:
10948 /* Use V2SImode and V4SImode as representatives of all 64-bit
10949 and 128-bit vector types. */
10950 size = int_size_in_bytes (type);
10951 switch (size)
10953 case 8:
10954 mode = V2SImode;
10955 break;
10956 case 16:
10957 mode = V4SImode;
10958 break;
10959 default:
10960 return -1;
10963 if (*modep == VOIDmode)
10964 *modep = mode;
10966 /* Vector modes are considered to be opaque: two vectors are
10967 equivalent for the purposes of being homogeneous aggregates
10968 if they are the same size. */
10969 if (*modep == mode)
10970 return 1;
10972 break;
10974 case ARRAY_TYPE:
10976 int count;
10977 tree index = TYPE_DOMAIN (type);
10979 /* Can't handle incomplete types or sizes that are not
10980 fixed. */
10981 if (!COMPLETE_TYPE_P (type)
10982 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10983 return -1;
10985 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10986 if (count == -1
10987 || !index
10988 || !TYPE_MAX_VALUE (index)
10989 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10990 || !TYPE_MIN_VALUE (index)
10991 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10992 || count < 0)
10993 return -1;
10995 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10996 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10998 /* There must be no padding. */
10999 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11000 return -1;
11002 return count;
11005 case RECORD_TYPE:
11007 int count = 0;
11008 int sub_count;
11009 tree field;
11011 /* Can't handle incomplete types or sizes that are not
11012 fixed. */
11013 if (!COMPLETE_TYPE_P (type)
11014 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11015 return -1;
11017 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11019 if (TREE_CODE (field) != FIELD_DECL)
11020 continue;
11022 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11023 if (sub_count < 0)
11024 return -1;
11025 count += sub_count;
11028 /* There must be no padding. */
11029 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11030 return -1;
11032 return count;
11035 case UNION_TYPE:
11036 case QUAL_UNION_TYPE:
11038 /* These aren't very interesting except in a degenerate case. */
11039 int count = 0;
11040 int sub_count;
11041 tree field;
11043 /* Can't handle incomplete types or sizes that are not
11044 fixed. */
11045 if (!COMPLETE_TYPE_P (type)
11046 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11047 return -1;
11049 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11051 if (TREE_CODE (field) != FIELD_DECL)
11052 continue;
11054 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11055 if (sub_count < 0)
11056 return -1;
11057 count = count > sub_count ? count : sub_count;
11060 /* There must be no padding. */
11061 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11062 return -1;
11064 return count;
11067 default:
11068 break;
11071 return -1;
11074 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11075 type as described in AAPCS64 \S 4.1.2.
11077 See the comment above aarch64_composite_type_p for the notes on MODE. */
11079 static bool
11080 aarch64_short_vector_p (const_tree type,
11081 machine_mode mode)
11083 HOST_WIDE_INT size = -1;
11085 if (type && TREE_CODE (type) == VECTOR_TYPE)
11086 size = int_size_in_bytes (type);
11087 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11088 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11089 size = GET_MODE_SIZE (mode);
11091 return (size == 8 || size == 16);
11094 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11095 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11096 array types. The C99 floating-point complex types are also considered
11097 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11098 types, which are GCC extensions and out of the scope of AAPCS64, are
11099 treated as composite types here as well.
11101 Note that MODE itself is not sufficient in determining whether a type
11102 is such a composite type or not. This is because
11103 stor-layout.c:compute_record_mode may have already changed the MODE
11104 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11105 structure with only one field may have its MODE set to the mode of the
11106 field. Also an integer mode whose size matches the size of the
11107 RECORD_TYPE type may be used to substitute the original mode
11108 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11109 solely relied on. */
11111 static bool
11112 aarch64_composite_type_p (const_tree type,
11113 machine_mode mode)
11115 if (aarch64_short_vector_p (type, mode))
11116 return false;
11118 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11119 return true;
11121 if (mode == BLKmode
11122 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11123 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11124 return true;
11126 return false;
11129 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11130 shall be passed or returned in simd/fp register(s) (providing these
11131 parameter passing registers are available).
11133 Upon successful return, *COUNT returns the number of needed registers,
11134 *BASE_MODE returns the mode of the individual register and, when IS_HA
11135 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11136 floating-point aggregate or a homogeneous short-vector aggregate. */
11138 static bool
11139 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11140 const_tree type,
11141 machine_mode *base_mode,
11142 int *count,
11143 bool *is_ha)
11145 machine_mode new_mode = VOIDmode;
11146 bool composite_p = aarch64_composite_type_p (type, mode);
11148 if (is_ha != NULL) *is_ha = false;
11150 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11151 || aarch64_short_vector_p (type, mode))
11153 *count = 1;
11154 new_mode = mode;
11156 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11158 if (is_ha != NULL) *is_ha = true;
11159 *count = 2;
11160 new_mode = GET_MODE_INNER (mode);
11162 else if (type && composite_p)
11164 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11166 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11168 if (is_ha != NULL) *is_ha = true;
11169 *count = ag_count;
11171 else
11172 return false;
11174 else
11175 return false;
11177 *base_mode = new_mode;
11178 return true;
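/* Illustrative example: under the rules above

     struct hfa { double x, y, z; };

   is a homogeneous floating-point aggregate, so *BASE_MODE is DFmode,
   *COUNT is 3 and *IS_HA is set; the value is then passed or returned
   in three consecutive FP/SIMD registers when enough are available.  */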
11181 /* Implement TARGET_STRUCT_VALUE_RTX. */
11183 static rtx
11184 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11185 int incoming ATTRIBUTE_UNUSED)
11187 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11190 /* Implements target hook vector_mode_supported_p. */
11191 static bool
11192 aarch64_vector_mode_supported_p (machine_mode mode)
11194 if (TARGET_SIMD
11195 && (mode == V4SImode || mode == V8HImode
11196 || mode == V16QImode || mode == V2DImode
11197 || mode == V2SImode || mode == V4HImode
11198 || mode == V8QImode || mode == V2SFmode
11199 || mode == V4SFmode || mode == V2DFmode
11200 || mode == V4HFmode || mode == V8HFmode
11201 || mode == V1DFmode))
11202 return true;
11204 return false;
11207 /* Return appropriate SIMD container
11208 for MODE within a vector of WIDTH bits. */
11209 static machine_mode
11210 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11212 gcc_assert (width == 64 || width == 128);
11213 if (TARGET_SIMD)
11215 if (width == 128)
11216 switch (mode)
11218 case DFmode:
11219 return V2DFmode;
11220 case SFmode:
11221 return V4SFmode;
11222 case HFmode:
11223 return V8HFmode;
11224 case SImode:
11225 return V4SImode;
11226 case HImode:
11227 return V8HImode;
11228 case QImode:
11229 return V16QImode;
11230 case DImode:
11231 return V2DImode;
11232 default:
11233 break;
11235 else
11236 switch (mode)
11238 case SFmode:
11239 return V2SFmode;
11240 case HFmode:
11241 return V4HFmode;
11242 case SImode:
11243 return V2SImode;
11244 case HImode:
11245 return V4HImode;
11246 case QImode:
11247 return V8QImode;
11248 default:
11249 break;
11252 return word_mode;
11255 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11256 static machine_mode
11257 aarch64_preferred_simd_mode (machine_mode mode)
11259 return aarch64_simd_container_mode (mode, 128);
11262 /* Return the bitmask of possible vector sizes for the vectorizer
11263 to iterate over. */
11264 static unsigned int
11265 aarch64_autovectorize_vector_sizes (void)
11267 return (16 | 8);
11270 /* Implement TARGET_MANGLE_TYPE. */
11272 static const char *
11273 aarch64_mangle_type (const_tree type)
11275 /* The AArch64 ABI documents say that "__va_list" has to be
11276 mangled as if it is in the "std" namespace. */
11277 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11278 return "St9__va_list";
11280 /* Half-precision float. */
11281 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11282 return "Dh";
11284 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11285 builtin types. */
11286 if (TYPE_NAME (type) != NULL)
11287 return aarch64_mangle_builtin_type (type);
11289 /* Use the default mangling. */
11290 return NULL;
11293 /* Find the first rtx_insn before insn that will generate an assembly
11294 instruction. */
11296 static rtx_insn *
11297 aarch64_prev_real_insn (rtx_insn *insn)
11299 if (!insn)
11300 return NULL;
11304 insn = prev_real_insn (insn);
11306 while (insn && recog_memoized (insn) < 0);
11308 return insn;
11311 static bool
11312 is_madd_op (enum attr_type t1)
11314 unsigned int i;
11315 /* A number of these may be AArch32 only. */
11316 enum attr_type mlatypes[] = {
11317 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11318 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11319 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11322 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11324 if (t1 == mlatypes[i])
11325 return true;
11328 return false;
11331 /* Check if there is a register dependency between a load and the insn
11332 for which we hold recog_data. */
11334 static bool
11335 dep_between_memop_and_curr (rtx memop)
11337 rtx load_reg;
11338 int opno;
11340 gcc_assert (GET_CODE (memop) == SET);
11342 if (!REG_P (SET_DEST (memop)))
11343 return false;
11345 load_reg = SET_DEST (memop);
11346 for (opno = 1; opno < recog_data.n_operands; opno++)
11348 rtx operand = recog_data.operand[opno];
11349 if (REG_P (operand)
11350 && reg_overlap_mentioned_p (load_reg, operand))
11351 return true;
11354 return false;
11358 /* When working around the Cortex-A53 erratum 835769,
11359 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11360 instruction and has a preceding memory instruction such that a NOP
11361 should be inserted between them. */
11363 bool
11364 aarch64_madd_needs_nop (rtx_insn* insn)
11366 enum attr_type attr_type;
11367 rtx_insn *prev;
11368 rtx body;
11370 if (!TARGET_FIX_ERR_A53_835769)
11371 return false;
11373 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11374 return false;
11376 attr_type = get_attr_type (insn);
11377 if (!is_madd_op (attr_type))
11378 return false;
11380 prev = aarch64_prev_real_insn (insn);
11381 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11382 Restore recog state to INSN to avoid state corruption. */
11383 extract_constrain_insn_cached (insn);
11385 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11386 return false;
11388 body = single_set (prev);
11390 /* If the previous insn is a memory op and there is no dependency between
11391 it and the DImode madd, emit a NOP between them. If body is NULL then we
11392 have a complex memory operation, probably a load/store pair.
11393 Be conservative for now and emit a NOP. */
11394 if (GET_MODE (recog_data.operand[0]) == DImode
11395 && (!body || !dep_between_memop_and_curr (body)))
11396 return true;
11398 return false;
11403 /* Implement FINAL_PRESCAN_INSN. */
11405 void
11406 aarch64_final_prescan_insn (rtx_insn *insn)
11408 if (aarch64_madd_needs_nop (insn))
11409 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
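/* Illustrative example of the output shape only: with
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that follows a
   memory operation it does not depend on, such as

     ldr  x2, [x1]
     madd x0, x3, x4, x5

   is emitted with a padding nop between the two instructions:

     ldr  x2, [x1]
     nop // between mem op and mult-accumulate
     madd x0, x3, x4, x5  */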
11413 /* Return the equivalent letter for size. */
11414 static char
11415 sizetochar (int size)
11417 switch (size)
11419 case 64: return 'd';
11420 case 32: return 's';
11421 case 16: return 'h';
11422 case 8 : return 'b';
11423 default: gcc_unreachable ();
11427 /* Return true iff x is a uniform vector of floating-point
11428 constants, and the constant can be represented in
11429 quarter-precision form. Note, as aarch64_float_const_representable
11430 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11431 static bool
11432 aarch64_vect_float_const_representable_p (rtx x)
11434 rtx elt;
11435 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11436 && const_vec_duplicate_p (x, &elt)
11437 && aarch64_float_const_representable_p (elt));
11440 /* Return true if OP is a valid SIMD immediate, filling in INFO if nonnull; return false otherwise. */
11441 bool
11442 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11443 struct simd_immediate_info *info)
11445 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11446 matches = 1; \
11447 for (i = 0; i < idx; i += (STRIDE)) \
11448 if (!(TEST)) \
11449 matches = 0; \
11450 if (matches) \
11452 immtype = (CLASS); \
11453 elsize = (ELSIZE); \
11454 eshift = (SHIFT); \
11455 emvn = (NEG); \
11456 break; \
11459 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11460 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11461 unsigned char bytes[16];
11462 int immtype = -1, matches;
11463 unsigned int invmask = inverse ? 0xff : 0;
11464 int eshift, emvn;
11466 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11468 if (! (aarch64_simd_imm_zero_p (op, mode)
11469 || aarch64_vect_float_const_representable_p (op)))
11470 return false;
11472 if (info)
11474 info->value = CONST_VECTOR_ELT (op, 0);
11475 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11476 info->mvn = false;
11477 info->shift = 0;
11480 return true;
11483 /* Splat vector constant out into a byte vector. */
11484 for (i = 0; i < n_elts; i++)
11486 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11487 it must be laid out in the vector register in reverse order. */
11488 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11489 unsigned HOST_WIDE_INT elpart;
11491 gcc_assert (CONST_INT_P (el));
11492 elpart = INTVAL (el);
11494 for (unsigned int byte = 0; byte < innersize; byte++)
11496 bytes[idx++] = (elpart & 0xff) ^ invmask;
11497 elpart >>= BITS_PER_UNIT;
11502 /* Sanity check. */
11503 gcc_assert (idx == GET_MODE_SIZE (mode));
11507 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11508 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11510 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11511 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11513 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11514 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11516 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11517 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11519 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11521 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11523 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11524 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11526 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11527 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11529 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11530 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11532 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11533 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11535 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11537 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11539 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11540 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11542 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11543 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11545 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11546 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11548 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11549 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11551 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11553 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11554 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11556 while (0);
11558 if (immtype == -1)
11559 return false;
11561 if (info)
11563 info->element_width = elsize;
11564 info->mvn = emvn != 0;
11565 info->shift = eshift;
11567 unsigned HOST_WIDE_INT imm = 0;
11569 if (immtype >= 12 && immtype <= 15)
11570 info->msl = true;
11572 /* Un-invert bytes of recognized vector, if necessary. */
11573 if (invmask != 0)
11574 for (i = 0; i < idx; i++)
11575 bytes[i] ^= invmask;
11577 if (immtype == 17)
11579 /* FIXME: Broken on 32-bit H_W_I hosts. */
11580 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11582 for (i = 0; i < 8; i++)
11583 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11584 << (i * BITS_PER_UNIT);
11587 info->value = GEN_INT (imm);
11589 else
11591 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11592 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11594 /* Construct 'abcdefgh' because the assembler cannot handle
11595 generic constants. */
11596 if (info->mvn)
11597 imm = ~imm;
11598 imm = (imm >> info->shift) & 0xff;
11599 info->value = GEN_INT (imm);
11603 return true;
11604 #undef CHECK
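/* Illustrative example, assuming a little-endian target: a V4SImode
   vector with every element equal to 0x00ab0000 matches the
   CHECK (4, 32, 2, ...) case above, so INFO is filled in with
   element_width 32, shift 16, mvn false and value 0xab, corresponding
   to an encoding of the form "movi v0.4s, #0xab, lsl #16".  */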
11607 /* Check whether immediate shift constants are within range. */
11608 bool
11609 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11611 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11612 if (left)
11613 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11614 else
11615 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11618 /* Return true if X is a uniform vector where all elements
11619 are either the floating-point constant 0.0 or the
11620 integer constant 0. */
11621 bool
11622 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11624 return x == CONST0_RTX (mode);
11628 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11629 operation of width WIDTH at bit position POS. */
11632 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11634 gcc_assert (CONST_INT_P (width));
11635 gcc_assert (CONST_INT_P (pos));
11637 unsigned HOST_WIDE_INT mask
11638 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11639 return GEN_INT (mask << UINTVAL (pos));
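/* Worked example (illustrative): for WIDTH == 8 and POS == 16 the
   result is ((1 << 8) - 1) << 16 == 0xff0000, i.e. the mask selecting
   the bits covered by a zero_extract of width 8 at bit position 16.  */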
11642 bool
11643 aarch64_mov_operand_p (rtx x, machine_mode mode)
11645 if (GET_CODE (x) == HIGH
11646 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11647 return true;
11649 if (CONST_INT_P (x))
11650 return true;
11652 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11653 return true;
11655 return aarch64_classify_symbolic_expression (x)
11656 == SYMBOL_TINY_ABSOLUTE;
11659 /* Return a CONST_VECTOR with all elements equal to VAL. */
11661 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11663 int nunits = GET_MODE_NUNITS (mode);
11664 rtvec v = rtvec_alloc (nunits);
11665 int i;
11667 rtx cache = GEN_INT (val);
11669 for (i=0; i < nunits; i++)
11670 RTVEC_ELT (v, i) = cache;
11672 return gen_rtx_CONST_VECTOR (mode, v);
11675 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11677 bool
11678 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11680 machine_mode vmode;
11682 gcc_assert (!VECTOR_MODE_P (mode));
11683 vmode = aarch64_preferred_simd_mode (mode);
11684 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11685 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11688 /* Construct and return a PARALLEL RTX vector with elements numbering the
11689 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11690 the vector - from the perspective of the architecture. This does not
11691 line up with GCC's perspective on lane numbers, so we end up with
11692 different masks depending on our target endian-ness. The diagram
11693 below may help. We must draw the distinction when building masks
11694 which select one half of the vector. An instruction selecting
11695 architectural low-lanes for a big-endian target must be described using
11696 a mask selecting GCC high-lanes.
11698 Big-Endian Little-Endian
11700 GCC 0 1 2 3 3 2 1 0
11701 | x | x | x | x | | x | x | x | x |
11702 Architecture 3 2 1 0 3 2 1 0
11704 Low Mask: { 2, 3 } { 0, 1 }
11705 High Mask: { 0, 1 } { 2, 3 }
11709 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11711 int nunits = GET_MODE_NUNITS (mode);
11712 rtvec v = rtvec_alloc (nunits / 2);
11713 int high_base = nunits / 2;
11714 int low_base = 0;
11715 int base;
11716 rtx t1;
11717 int i;
11719 if (BYTES_BIG_ENDIAN)
11720 base = high ? low_base : high_base;
11721 else
11722 base = high ? high_base : low_base;
11724 for (i = 0; i < nunits / 2; i++)
11725 RTVEC_ELT (v, i) = GEN_INT (base + i);
11727 t1 = gen_rtx_PARALLEL (mode, v);
11728 return t1;
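/* Illustrative example: for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on a little-endian target
   and (parallel [(const_int 0) (const_int 1)]) on a big-endian one,
   matching the diagram above.  */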
11731 /* Check OP for validity as a PARALLEL RTX vector with elements
11732 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11733 from the perspective of the architecture. See the diagram above
11734 aarch64_simd_vect_par_cnst_half for more details. */
11736 bool
11737 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11738 bool high)
11740 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11741 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11742 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11743 int i = 0;
11745 if (!VECTOR_MODE_P (mode))
11746 return false;
11748 if (count_op != count_ideal)
11749 return false;
11751 for (i = 0; i < count_ideal; i++)
11753 rtx elt_op = XVECEXP (op, 0, i);
11754 rtx elt_ideal = XVECEXP (ideal, 0, i);
11756 if (!CONST_INT_P (elt_op)
11757 || INTVAL (elt_ideal) != INTVAL (elt_op))
11758 return false;
11760 return true;
11763 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11764 HIGH (exclusive). */
11765 void
11766 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11767 const_tree exp)
11769 HOST_WIDE_INT lane;
11770 gcc_assert (CONST_INT_P (operand));
11771 lane = INTVAL (operand);
11773 if (lane < low || lane >= high)
11775 if (exp)
11776 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11777 else
11778 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11782 /* Return TRUE if OP is a valid vector addressing mode. */
11783 bool
11784 aarch64_simd_mem_operand_p (rtx op)
11786 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11787 || REG_P (XEXP (op, 0)));
11790 /* Emit a register copy from operand to operand, taking care not to
11791 early-clobber source registers in the process.
11793 COUNT is the number of components into which the copy needs to be
11794 decomposed. */
11795 void
11796 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11797 unsigned int count)
11799 unsigned int i;
11800 int rdest = REGNO (operands[0]);
11801 int rsrc = REGNO (operands[1]);
11803 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11804 || rdest < rsrc)
11805 for (i = 0; i < count; i++)
11806 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11807 gen_rtx_REG (mode, rsrc + i));
11808 else
11809 for (i = 0; i < count; i++)
11810 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11811 gen_rtx_REG (mode, rsrc + count - i - 1));
11814 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11815 one of VSTRUCT modes: OI, CI, or XI. */
11817 aarch64_simd_attr_length_rglist (machine_mode mode)
11819 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11822 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11823 alignment of a vector to 128 bits. */
11824 static HOST_WIDE_INT
11825 aarch64_simd_vector_alignment (const_tree type)
11827 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11828 return MIN (align, 128);
11831 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11832 static bool
11833 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11835 if (is_packed)
11836 return false;
11838 /* We guarantee alignment for vectors up to 128-bits. */
11839 if (tree_int_cst_compare (TYPE_SIZE (type),
11840 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11841 return false;
11843 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11844 return true;
11847 /* Return true if the vector misalignment factor is supported by the
11848 target. */
11849 static bool
11850 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11851 const_tree type, int misalignment,
11852 bool is_packed)
11854 if (TARGET_SIMD && STRICT_ALIGNMENT)
11856 /* Return if movmisalign pattern is not supported for this mode. */
11857 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11858 return false;
11860 if (misalignment == -1)
11862 /* Misalignment factor is unknown at compile time but we know
11863 it's word aligned. */
11864 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11866 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11868 if (element_size != 64)
11869 return true;
11871 return false;
11874 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11875 is_packed);
11878 /* If VALS is a vector constant that can be loaded into a register
11879 using DUP, generate instructions to do so and return an RTX to
11880 assign to the register. Otherwise return NULL_RTX. */
11881 static rtx
11882 aarch64_simd_dup_constant (rtx vals)
11884 machine_mode mode = GET_MODE (vals);
11885 machine_mode inner_mode = GET_MODE_INNER (mode);
11886 rtx x;
11888 if (!const_vec_duplicate_p (vals, &x))
11889 return NULL_RTX;
11891 /* We can load this constant by using DUP and a constant in a
11892 single ARM register. This will be cheaper than a vector
11893 load. */
11894 x = copy_to_mode_reg (inner_mode, x);
11895 return gen_rtx_VEC_DUPLICATE (mode, x);
11899 /* Generate code to load VALS, which is a PARALLEL containing only
11900 constants (for vec_init) or CONST_VECTOR, efficiently into a
11901 register. Returns an RTX to copy into the register, or NULL_RTX
11902 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11903 static rtx
11904 aarch64_simd_make_constant (rtx vals)
11906 machine_mode mode = GET_MODE (vals);
11907 rtx const_dup;
11908 rtx const_vec = NULL_RTX;
11909 int n_elts = GET_MODE_NUNITS (mode);
11910 int n_const = 0;
11911 int i;
11913 if (GET_CODE (vals) == CONST_VECTOR)
11914 const_vec = vals;
11915 else if (GET_CODE (vals) == PARALLEL)
11917 /* A CONST_VECTOR must contain only CONST_INTs and
11918 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11919 Only store valid constants in a CONST_VECTOR. */
11920 for (i = 0; i < n_elts; ++i)
11922 rtx x = XVECEXP (vals, 0, i);
11923 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11924 n_const++;
11926 if (n_const == n_elts)
11927 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11929 else
11930 gcc_unreachable ();
11932 if (const_vec != NULL_RTX
11933 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11934 /* Load using MOVI/MVNI. */
11935 return const_vec;
11936 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11937 /* Loaded using DUP. */
11938 return const_dup;
11939 else if (const_vec != NULL_RTX)
11940 /* Load from constant pool. We cannot take advantage of single-cycle
11941 LD1 because we need a PC-relative addressing mode. */
11942 return const_vec;
11943 else
11944 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11945 We cannot construct an initializer. */
11946 return NULL_RTX;
11949 /* Expand a vector initialisation sequence, such that TARGET is
11950 initialised to contain VALS. */
11952 void
11953 aarch64_expand_vector_init (rtx target, rtx vals)
11955 machine_mode mode = GET_MODE (target);
11956 machine_mode inner_mode = GET_MODE_INNER (mode);
11957 /* The number of vector elements. */
11958 int n_elts = GET_MODE_NUNITS (mode);
11959 /* The number of vector elements which are not constant. */
11960 int n_var = 0;
11961 rtx any_const = NULL_RTX;
11962 /* The first element of vals. */
11963 rtx v0 = XVECEXP (vals, 0, 0);
11964 bool all_same = true;
11966 /* Count the number of variable elements to initialise. */
11967 for (int i = 0; i < n_elts; ++i)
11969 rtx x = XVECEXP (vals, 0, i);
11970 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11971 ++n_var;
11972 else
11973 any_const = x;
11975 all_same &= rtx_equal_p (x, v0);
11978 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11979 how best to handle this. */
11980 if (n_var == 0)
11982 rtx constant = aarch64_simd_make_constant (vals);
11983 if (constant != NULL_RTX)
11985 emit_move_insn (target, constant);
11986 return;
11990 /* Splat a single non-constant element if we can. */
11991 if (all_same)
11993 rtx x = copy_to_mode_reg (inner_mode, v0);
11994 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11995 return;
11998 enum insn_code icode = optab_handler (vec_set_optab, mode);
11999 gcc_assert (icode != CODE_FOR_nothing);
12001 /* If there are only variable elements, try to optimize
12002 the insertion using dup for the most common element
12003 followed by insertions. */
12005 /* The algorithm will fill matches[*][0] with the earliest matching element,
12006 and matches[X][1] with the count of duplicate elements (if X is the
12007 earliest element which has duplicates). */
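/* Worked example: for vals = {x, y, x, x} the loops below produce
   matches[0] = {0, 3}, matches[1] = {1, 1}, matches[2] = {0, 0} and
   matches[3] = {0, 0}, so maxelement is 0; x is splatted with DUP and only
   lane 1 needs an explicit insert.  */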
12009 if (n_var == n_elts && n_elts <= 16)
12011 int matches[16][2] = {0};
12012 for (int i = 0; i < n_elts; i++)
12014 for (int j = 0; j <= i; j++)
12016 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12018 matches[i][0] = j;
12019 matches[j][1]++;
12020 break;
12024 int maxelement = 0;
12025 int maxv = 0;
12026 for (int i = 0; i < n_elts; i++)
12027 if (matches[i][1] > maxv)
12029 maxelement = i;
12030 maxv = matches[i][1];
12033 /* Create a duplicate of the most common element. */
12034 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12035 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12037 /* Insert the rest. */
12038 for (int i = 0; i < n_elts; i++)
12040 rtx x = XVECEXP (vals, 0, i);
12041 if (matches[i][0] == maxelement)
12042 continue;
12043 x = copy_to_mode_reg (inner_mode, x);
12044 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12046 return;
12049 /* Initialise a vector which is part-variable. We want to first try
12050 to build those lanes which are constant in the most efficient way we
12051 can. */
12052 if (n_var != n_elts)
12054 rtx copy = copy_rtx (vals);
12056 /* Load constant part of vector. We really don't care what goes into the
12057 parts we will overwrite, but we're more likely to be able to load the
12058 constant efficiently if it has fewer, larger, repeating parts
12059 (see aarch64_simd_valid_immediate). */
12060 for (int i = 0; i < n_elts; i++)
12062 rtx x = XVECEXP (vals, 0, i);
12063 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12064 continue;
12065 rtx subst = any_const;
12066 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12068 /* Look in the copied vector, as more elements are const. */
12069 rtx test = XVECEXP (copy, 0, i ^ bit);
12070 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12072 subst = test;
12073 break;
12076 XVECEXP (copy, 0, i) = subst;
12078 aarch64_expand_vector_init (target, copy);
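/* For example, with vals = {c0, v, c2, c3} (one variable lane) the loop
   above looks for a constant for lane 1 at lanes 1^2 = 3 and 1^1 = 0 of the
   copy; it finds c3, so the constant vector {c0, c3, c2, c3} is built first
   and v is inserted into lane 1 by the loop that follows.  */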
12081 /* Insert the variable lanes directly. */
12082 for (int i = 0; i < n_elts; i++)
12084 rtx x = XVECEXP (vals, 0, i);
12085 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12086 continue;
12087 x = copy_to_mode_reg (inner_mode, x);
12088 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12092 static unsigned HOST_WIDE_INT
12093 aarch64_shift_truncation_mask (machine_mode mode)
12095 return
12096 (!SHIFT_COUNT_TRUNCATED
12097 || aarch64_vector_mode_supported_p (mode)
12098 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
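/* E.g. this yields GET_MODE_BITSIZE (mode) - 1 (63 for DImode) only when
   shift counts are truncated and MODE is a plain scalar; all vector and
   vector-struct modes yield 0.  */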
12101 /* Select a format to encode pointers in exception handling data. */
12103 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12105 int type;
12106 switch (aarch64_cmodel)
12108 case AARCH64_CMODEL_TINY:
12109 case AARCH64_CMODEL_TINY_PIC:
12110 case AARCH64_CMODEL_SMALL:
12111 case AARCH64_CMODEL_SMALL_PIC:
12112 case AARCH64_CMODEL_SMALL_SPIC:
12113 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12114 for everything. */
12115 type = DW_EH_PE_sdata4;
12116 break;
12117 default:
12118 /* No assumptions here. 8-byte relocs required. */
12119 type = DW_EH_PE_sdata8;
12120 break;
12122 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12125 /* The last .arch and .tune assembly strings that we printed. */
12126 static std::string aarch64_last_printed_arch_string;
12127 static std::string aarch64_last_printed_tune_string;
12129 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12130 by the function fndecl. */
12132 void
12133 aarch64_declare_function_name (FILE *stream, const char* name,
12134 tree fndecl)
12136 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12138 struct cl_target_option *targ_options;
12139 if (target_parts)
12140 targ_options = TREE_TARGET_OPTION (target_parts);
12141 else
12142 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12143 gcc_assert (targ_options);
12145 const struct processor *this_arch
12146 = aarch64_get_arch (targ_options->x_explicit_arch);
12148 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12149 std::string extension
12150 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12151 this_arch->flags);
12152 /* Only update the assembler .arch string if it is distinct from the last
12153 such string we printed. */
12154 std::string to_print = this_arch->name + extension;
12155 if (to_print != aarch64_last_printed_arch_string)
12157 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12158 aarch64_last_printed_arch_string = to_print;
12161 /* Print the cpu name we're tuning for in the comments; it might be
12162 useful to readers of the generated asm. Do it only when it changes
12163 from function to function and verbose assembly is requested. */
12164 const struct processor *this_tune
12165 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12167 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12169 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12170 this_tune->name);
12171 aarch64_last_printed_tune_string = this_tune->name;
12174 /* Don't forget the type directive for ELF. */
12175 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12176 ASM_OUTPUT_LABEL (stream, name);
12179 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12181 static void
12182 aarch64_start_file (void)
12184 struct cl_target_option *default_options
12185 = TREE_TARGET_OPTION (target_option_default_node);
12187 const struct processor *default_arch
12188 = aarch64_get_arch (default_options->x_explicit_arch);
12189 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12190 std::string extension
12191 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12192 default_arch->flags);
12194 aarch64_last_printed_arch_string = default_arch->name + extension;
12195 aarch64_last_printed_tune_string = "";
12196 asm_fprintf (asm_out_file, "\t.arch %s\n",
12197 aarch64_last_printed_arch_string.c_str ());
12199 default_file_start ();
12202 /* Emit load exclusive. */
12204 static void
12205 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12206 rtx mem, rtx model_rtx)
12208 rtx (*gen) (rtx, rtx, rtx);
12210 switch (mode)
12212 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
12213 case HImode: gen = gen_aarch64_load_exclusivehi; break;
12214 case SImode: gen = gen_aarch64_load_exclusivesi; break;
12215 case DImode: gen = gen_aarch64_load_exclusivedi; break;
12216 default:
12217 gcc_unreachable ();
12220 emit_insn (gen (rval, mem, model_rtx));
12223 /* Emit store exclusive. */
12225 static void
12226 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12227 rtx rval, rtx mem, rtx model_rtx)
12229 rtx (*gen) (rtx, rtx, rtx, rtx);
12231 switch (mode)
12233 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
12234 case HImode: gen = gen_aarch64_store_exclusivehi; break;
12235 case SImode: gen = gen_aarch64_store_exclusivesi; break;
12236 case DImode: gen = gen_aarch64_store_exclusivedi; break;
12237 default:
12238 gcc_unreachable ();
12241 emit_insn (gen (bval, rval, mem, model_rtx));
12244 /* Mark the previous jump instruction as unlikely. */
12246 static void
12247 aarch64_emit_unlikely_jump (rtx insn)
12249 rtx_insn *jump = emit_jump_insn (insn);
12250 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12253 /* Expand a compare and swap pattern. */
12255 void
12256 aarch64_expand_compare_and_swap (rtx operands[])
12258 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12259 machine_mode mode, cmp_mode;
12260 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12261 int idx;
12262 gen_cas_fn gen;
12263 const gen_cas_fn split_cas[] =
12265 gen_aarch64_compare_and_swapqi,
12266 gen_aarch64_compare_and_swaphi,
12267 gen_aarch64_compare_and_swapsi,
12268 gen_aarch64_compare_and_swapdi
12270 const gen_cas_fn atomic_cas[] =
12272 gen_aarch64_compare_and_swapqi_lse,
12273 gen_aarch64_compare_and_swaphi_lse,
12274 gen_aarch64_compare_and_swapsi_lse,
12275 gen_aarch64_compare_and_swapdi_lse
12278 bval = operands[0];
12279 rval = operands[1];
12280 mem = operands[2];
12281 oldval = operands[3];
12282 newval = operands[4];
12283 is_weak = operands[5];
12284 mod_s = operands[6];
12285 mod_f = operands[7];
12286 mode = GET_MODE (mem);
12287 cmp_mode = mode;
12289 /* Normally the succ memory model must be stronger than fail, but in the
12290 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12291 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12293 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12294 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12295 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12297 switch (mode)
12299 case QImode:
12300 case HImode:
12301 /* For short modes, we're going to perform the comparison in SImode,
12302 so do the zero-extension now. */
12303 cmp_mode = SImode;
12304 rval = gen_reg_rtx (SImode);
12305 oldval = convert_modes (SImode, mode, oldval, true);
12306 /* Fall through. */
12308 case SImode:
12309 case DImode:
12310 /* Force the value into a register if needed. */
12311 if (!aarch64_plus_operand (oldval, mode))
12312 oldval = force_reg (cmp_mode, oldval);
12313 break;
12315 default:
12316 gcc_unreachable ();
12319 switch (mode)
12321 case QImode: idx = 0; break;
12322 case HImode: idx = 1; break;
12323 case SImode: idx = 2; break;
12324 case DImode: idx = 3; break;
12325 default:
12326 gcc_unreachable ();
12328 if (TARGET_LSE)
12329 gen = atomic_cas[idx];
12330 else
12331 gen = split_cas[idx];
12333 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12335 if (mode == QImode || mode == HImode)
12336 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12338 x = gen_rtx_REG (CCmode, CC_REGNUM);
12339 x = gen_rtx_EQ (SImode, x, const0_rtx);
12340 emit_insn (gen_rtx_SET (bval, x));
12343 /* Test whether the target supports using an atomic load-operate instruction.
12344 CODE is the operation to be performed on memory. Returns FALSE if the
12345 operation isn't supported by the architecture. */
12349 bool
12350 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12352 if (!TARGET_LSE)
12353 return false;
12355 switch (code)
12357 case SET:
12358 case AND:
12359 case IOR:
12360 case XOR:
12361 case MINUS:
12362 case PLUS:
12363 return true;
12364 default:
12365 return false;
12369 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12370 sequence implementing an atomic operation. */
12372 static void
12373 aarch64_emit_post_barrier (enum memmodel model)
12375 const enum memmodel base_model = memmodel_base (model);
12377 if (is_mm_sync (model)
12378 && (base_model == MEMMODEL_ACQUIRE
12379 || base_model == MEMMODEL_ACQ_REL
12380 || base_model == MEMMODEL_SEQ_CST))
12382 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12386 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12387 for the data in memory. EXPECTED is the value expected to be in memory.
12388 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12389 is the memory ordering to use. */
12391 void
12392 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12393 rtx expected, rtx desired,
12394 rtx model)
12396 rtx (*gen) (rtx, rtx, rtx, rtx);
12397 machine_mode mode;
12399 mode = GET_MODE (mem);
12401 switch (mode)
12403 case QImode: gen = gen_aarch64_atomic_casqi; break;
12404 case HImode: gen = gen_aarch64_atomic_cashi; break;
12405 case SImode: gen = gen_aarch64_atomic_cassi; break;
12406 case DImode: gen = gen_aarch64_atomic_casdi; break;
12407 default:
12408 gcc_unreachable ();
12411 /* Move the expected value into the CAS destination register. */
12412 emit_insn (gen_rtx_SET (rval, expected));
12414 /* Emit the CAS. */
12415 emit_insn (gen (rval, mem, desired, model));
12417 /* Compare the expected value with the value loaded by the CAS, to establish
12418 whether the swap was made. */
12419 aarch64_gen_compare_reg (EQ, rval, expected);
12422 /* Split a compare and swap pattern. */
12424 void
12425 aarch64_split_compare_and_swap (rtx operands[])
12427 rtx rval, mem, oldval, newval, scratch;
12428 machine_mode mode;
12429 bool is_weak;
12430 rtx_code_label *label1, *label2;
12431 rtx x, cond;
12432 enum memmodel model;
12433 rtx model_rtx;
12435 rval = operands[0];
12436 mem = operands[1];
12437 oldval = operands[2];
12438 newval = operands[3];
12439 is_weak = (operands[4] != const0_rtx);
12440 model_rtx = operands[5];
12441 scratch = operands[7];
12442 mode = GET_MODE (mem);
12443 model = memmodel_from_int (INTVAL (model_rtx));
12445 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12446 loop:
12447 .label1:
12448 LD[A]XR rval, [mem]
12449 CBNZ rval, .label2
12450 ST[L]XR scratch, newval, [mem]
12451 CBNZ scratch, .label1
12452 .label2:
12453 CMP rval, 0. */
12454 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12456 label1 = NULL;
12457 if (!is_weak)
12459 label1 = gen_label_rtx ();
12460 emit_label (label1);
12462 label2 = gen_label_rtx ();
12464 /* The initial load can be relaxed for a __sync operation since a final
12465 barrier will be emitted to stop code hoisting. */
12466 if (is_mm_sync (model))
12467 aarch64_emit_load_exclusive (mode, rval, mem,
12468 GEN_INT (MEMMODEL_RELAXED));
12469 else
12470 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12472 if (strong_zero_p)
12474 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12475 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12476 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12477 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12479 else
12481 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12482 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12483 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12484 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12485 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12488 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12490 if (!is_weak)
12492 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12493 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12494 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12495 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12497 else
12499 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12500 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12501 emit_insn (gen_rtx_SET (cond, x));
12504 emit_label (label2);
12505 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12506 to set the condition flags. If this is not used it will be removed by
12507 later passes. */
12508 if (strong_zero_p)
12510 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12511 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12512 emit_insn (gen_rtx_SET (cond, x));
12514 /* Emit any final barrier needed for a __sync operation. */
12515 if (is_mm_sync (model))
12516 aarch64_emit_post_barrier (model);
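/* For reference, the strong case with a non-zero OLDVAL expands to roughly:
   .label1:
   LD[A]XR rval, [mem]
   CMP rval, oldval
   B.NE .label2
   ST[L]XR scratch, newval, [mem]
   CBNZ scratch, .label1
   .label2:
   whereas the weak variant emits no loop-back branch and instead leaves the
   ST[L]XR result in the condition flags.  */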
12519 /* Emit a BIC instruction. */
12521 static void
12522 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12524 rtx shift_rtx = GEN_INT (shift);
12525 rtx (*gen) (rtx, rtx, rtx, rtx);
12527 switch (mode)
12529 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12530 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12531 default:
12532 gcc_unreachable ();
12535 emit_insn (gen (dst, s2, shift_rtx, s1));
12538 /* Emit an atomic swap. */
12540 static void
12541 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12542 rtx mem, rtx model)
12544 rtx (*gen) (rtx, rtx, rtx, rtx);
12546 switch (mode)
12548 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12549 case HImode: gen = gen_aarch64_atomic_swphi; break;
12550 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12551 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12552 default:
12553 gcc_unreachable ();
12556 emit_insn (gen (dst, mem, value, model));
12559 /* Operations supported by aarch64_emit_atomic_load_op. */
12561 enum aarch64_atomic_load_op_code
12563 AARCH64_LDOP_PLUS, /* A + B */
12564 AARCH64_LDOP_XOR, /* A ^ B */
12565 AARCH64_LDOP_OR, /* A | B */
12566 AARCH64_LDOP_BIC /* A & ~B */
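/* These map onto the ARMv8.1-A LSE instructions LDADD (PLUS), LDEOR (XOR),
   LDSET (OR) and LDCLR (BIC), with acquire/release variants selected by the
   memory model operand.  */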
12569 /* Emit an atomic load-operate. */
12571 static void
12572 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12573 machine_mode mode, rtx dst, rtx src,
12574 rtx mem, rtx model)
12576 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12577 const aarch64_atomic_load_op_fn plus[] =
12579 gen_aarch64_atomic_loadaddqi,
12580 gen_aarch64_atomic_loadaddhi,
12581 gen_aarch64_atomic_loadaddsi,
12582 gen_aarch64_atomic_loadadddi
12584 const aarch64_atomic_load_op_fn eor[] =
12586 gen_aarch64_atomic_loadeorqi,
12587 gen_aarch64_atomic_loadeorhi,
12588 gen_aarch64_atomic_loadeorsi,
12589 gen_aarch64_atomic_loadeordi
12591 const aarch64_atomic_load_op_fn ior[] =
12593 gen_aarch64_atomic_loadsetqi,
12594 gen_aarch64_atomic_loadsethi,
12595 gen_aarch64_atomic_loadsetsi,
12596 gen_aarch64_atomic_loadsetdi
12598 const aarch64_atomic_load_op_fn bic[] =
12600 gen_aarch64_atomic_loadclrqi,
12601 gen_aarch64_atomic_loadclrhi,
12602 gen_aarch64_atomic_loadclrsi,
12603 gen_aarch64_atomic_loadclrdi
12605 aarch64_atomic_load_op_fn gen;
12606 int idx = 0;
12608 switch (mode)
12610 case QImode: idx = 0; break;
12611 case HImode: idx = 1; break;
12612 case SImode: idx = 2; break;
12613 case DImode: idx = 3; break;
12614 default:
12615 gcc_unreachable ();
12618 switch (code)
12620 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12621 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12622 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12623 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12624 default:
12625 gcc_unreachable ();
12628 emit_insn (gen (dst, mem, src, model));
12631 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12632 location to store the data read from memory. OUT_RESULT is the location to
12633 store the result of the operation. MEM is the memory location to read and
12634 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12635 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12636 be NULL. */
12638 void
12639 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12640 rtx mem, rtx value, rtx model_rtx)
12642 machine_mode mode = GET_MODE (mem);
12643 machine_mode wmode = (mode == DImode ? DImode : SImode);
12644 const bool short_mode = (mode < SImode);
12645 aarch64_atomic_load_op_code ldop_code;
12646 rtx src;
12647 rtx x;
12649 if (out_data)
12650 out_data = gen_lowpart (mode, out_data);
12652 if (out_result)
12653 out_result = gen_lowpart (mode, out_result);
12655 /* Make sure the value is in a register, putting it into a destination
12656 register if it needs to be manipulated. */
12657 if (!register_operand (value, mode)
12658 || code == AND || code == MINUS)
12660 src = out_result ? out_result : out_data;
12661 emit_move_insn (src, gen_lowpart (mode, value));
12663 else
12664 src = value;
12665 gcc_assert (register_operand (src, mode));
12667 /* Preprocess the data for the operation as necessary. If the operation is
12668 a SET then emit a swap instruction and finish. */
12669 switch (code)
12671 case SET:
12672 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12673 return;
12675 case MINUS:
12676 /* Negate the value and treat it as a PLUS. */
12678 rtx neg_src;
12680 /* Resize the value if necessary. */
12681 if (short_mode)
12682 src = gen_lowpart (wmode, src);
12684 neg_src = gen_rtx_NEG (wmode, src);
12685 emit_insn (gen_rtx_SET (src, neg_src));
12687 if (short_mode)
12688 src = gen_lowpart (mode, src);
12690 /* Fall-through. */
12691 case PLUS:
12692 ldop_code = AARCH64_LDOP_PLUS;
12693 break;
12695 case IOR:
12696 ldop_code = AARCH64_LDOP_OR;
12697 break;
12699 case XOR:
12700 ldop_code = AARCH64_LDOP_XOR;
12701 break;
12703 case AND:
12705 rtx not_src;
12707 /* Resize the value if necessary. */
12708 if (short_mode)
12709 src = gen_lowpart (wmode, src);
12711 not_src = gen_rtx_NOT (wmode, src);
12712 emit_insn (gen_rtx_SET (src, not_src));
12714 if (short_mode)
12715 src = gen_lowpart (mode, src);
12717 ldop_code = AARCH64_LDOP_BIC;
12718 break;
12720 default:
12721 /* The operation can't be done with atomic instructions. */
12722 gcc_unreachable ();
12725 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12727 /* If necessary, calculate the data in memory after the update by redoing the
12728 operation from values in registers. */
12729 if (!out_result)
12730 return;
12732 if (short_mode)
12734 src = gen_lowpart (wmode, src);
12735 out_data = gen_lowpart (wmode, out_data);
12736 out_result = gen_lowpart (wmode, out_result);
12739 x = NULL_RTX;
12741 switch (code)
12743 case MINUS:
12744 case PLUS:
12745 x = gen_rtx_PLUS (wmode, out_data, src);
12746 break;
12747 case IOR:
12748 x = gen_rtx_IOR (wmode, out_data, src);
12749 break;
12750 case XOR:
12751 x = gen_rtx_XOR (wmode, out_data, src);
12752 break;
12753 case AND:
12754 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12755 return;
12756 default:
12757 gcc_unreachable ();
12760 emit_set_insn (out_result, x);
12762 return;
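/* Illustrative walk-through: for an atomic AND with LSE, the code above
   copies VALUE into a scratch register, inverts it, and issues LDCLR, which
   stores memory & ~src and returns the old contents; if the updated value is
   also wanted it is recomputed as old & ~src by the BIC above.  */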
12765 /* Split an atomic operation. */
12767 void
12768 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12769 rtx value, rtx model_rtx, rtx cond)
12771 machine_mode mode = GET_MODE (mem);
12772 machine_mode wmode = (mode == DImode ? DImode : SImode);
12773 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12774 const bool is_sync = is_mm_sync (model);
12775 rtx_code_label *label;
12776 rtx x;
12778 /* Split the atomic operation into a sequence. */
12779 label = gen_label_rtx ();
12780 emit_label (label);
12782 if (new_out)
12783 new_out = gen_lowpart (wmode, new_out);
12784 if (old_out)
12785 old_out = gen_lowpart (wmode, old_out);
12786 else
12787 old_out = new_out;
12788 value = simplify_gen_subreg (wmode, value, mode, 0);
12790 /* The initial load can be relaxed for a __sync operation since a final
12791 barrier will be emitted to stop code hoisting. */
12792 if (is_sync)
12793 aarch64_emit_load_exclusive (mode, old_out, mem,
12794 GEN_INT (MEMMODEL_RELAXED));
12795 else
12796 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12798 switch (code)
12800 case SET:
12801 new_out = value;
12802 break;
12804 case NOT:
12805 x = gen_rtx_AND (wmode, old_out, value);
12806 emit_insn (gen_rtx_SET (new_out, x));
12807 x = gen_rtx_NOT (wmode, new_out);
12808 emit_insn (gen_rtx_SET (new_out, x));
12809 break;
12811 case MINUS:
12812 if (CONST_INT_P (value))
12814 value = GEN_INT (-INTVAL (value));
12815 code = PLUS;
12817 /* Fall through. */
12819 default:
12820 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12821 emit_insn (gen_rtx_SET (new_out, x));
12822 break;
12825 aarch64_emit_store_exclusive (mode, cond, mem,
12826 gen_lowpart (mode, new_out), model_rtx);
12828 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12829 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12830 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12831 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12833 /* Emit any final barrier needed for a __sync operation. */
12834 if (is_sync)
12835 aarch64_emit_post_barrier (model);
12838 static void
12839 aarch64_init_libfuncs (void)
12841 /* Half-precision float operations. The compiler handles all operations
12842 with NULL libfuncs by converting to SFmode. */
12844 /* Conversions. */
12845 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12846 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12848 /* Arithmetic. */
12849 set_optab_libfunc (add_optab, HFmode, NULL);
12850 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12851 set_optab_libfunc (smul_optab, HFmode, NULL);
12852 set_optab_libfunc (neg_optab, HFmode, NULL);
12853 set_optab_libfunc (sub_optab, HFmode, NULL);
12855 /* Comparisons. */
12856 set_optab_libfunc (eq_optab, HFmode, NULL);
12857 set_optab_libfunc (ne_optab, HFmode, NULL);
12858 set_optab_libfunc (lt_optab, HFmode, NULL);
12859 set_optab_libfunc (le_optab, HFmode, NULL);
12860 set_optab_libfunc (ge_optab, HFmode, NULL);
12861 set_optab_libfunc (gt_optab, HFmode, NULL);
12862 set_optab_libfunc (unord_optab, HFmode, NULL);
12865 /* Target hook for c_mode_for_suffix. */
12866 static machine_mode
12867 aarch64_c_mode_for_suffix (char suffix)
12869 if (suffix == 'q')
12870 return TFmode;
12872 return VOIDmode;
12875 /* We can only represent floating point constants which will fit in
12876 "quarter-precision" values. These values are characterised by
12877 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12880 (-1)^s * (n/16) * 2^r
12882 Where:
12883 's' is the sign bit.
12884 'n' is an integer in the range 16 <= n <= 31.
12885 'r' is an integer in the range -3 <= r <= 4. */
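/* For example, 1.0 = (16/16) * 2^0, 0.5 = (16/16) * 2^-1 and
   31.0 = (31/16) * 2^4 are representable, so the encodable magnitudes run
   from 0.125 up to 31.0; values such as 0.1 are not representable.  */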
12887 /* Return true iff X can be represented by a quarter-precision
12888 floating point immediate operand. Note, we cannot represent 0.0. */
12889 bool
12890 aarch64_float_const_representable_p (rtx x)
12892 /* This represents our current view of how many bits
12893 make up the mantissa. */
12894 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12895 int exponent;
12896 unsigned HOST_WIDE_INT mantissa, mask;
12897 REAL_VALUE_TYPE r, m;
12898 bool fail;
12900 if (!CONST_DOUBLE_P (x))
12901 return false;
12903 /* We don't support HFmode constants yet. */
12904 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12905 return false;
12907 r = *CONST_DOUBLE_REAL_VALUE (x);
12909 /* We cannot represent infinities, NaNs or +/-zero. We won't
12910 know if we have +zero until we analyse the mantissa, but we
12911 can reject the other invalid values. */
12912 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12913 || REAL_VALUE_MINUS_ZERO (r))
12914 return false;
12916 /* Extract exponent. */
12917 r = real_value_abs (&r);
12918 exponent = REAL_EXP (&r);
12920 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12921 highest (sign) bit, with a fixed binary point at bit point_pos.
12922 m1 holds the low part of the mantissa, m2 the high part.
12923 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12924 bits for the mantissa, this can fail (low bits will be lost). */
12925 real_ldexp (&m, &r, point_pos - exponent);
12926 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12928 /* If the low part of the mantissa has bits set we cannot represent
12929 the value. */
12930 if (w.ulow () != 0)
12931 return false;
12932 /* We have rejected the lower HOST_WIDE_INT, so update our
12933 understanding of how many bits lie in the mantissa and
12934 look only at the high HOST_WIDE_INT. */
12935 mantissa = w.elt (1);
12936 point_pos -= HOST_BITS_PER_WIDE_INT;
12938 /* We can only represent values with a mantissa of the form 1.xxxx. */
12939 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12940 if ((mantissa & mask) != 0)
12941 return false;
12943 /* Having filtered unrepresentable values, we may now remove all
12944 but the highest 5 bits. */
12945 mantissa >>= point_pos - 5;
12947 /* We cannot represent the value 0.0, so reject it. This is handled
12948 elsewhere. */
12949 if (mantissa == 0)
12950 return false;
12952 /* Then, as bit 4 is always set, we can mask it off, leaving
12953 the mantissa in the range [0, 15]. */
12954 mantissa &= ~(1 << 4);
12955 gcc_assert (mantissa <= 15);
12957 /* GCC internally does not use IEEE754-like encoding (where normalized
12958 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12959 Our mantissa values are shifted 4 places to the left relative to
12960 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12961 by 5 places to correct for GCC's representation. */
12962 exponent = 5 - exponent;
12964 return (exponent >= 0 && exponent <= 7);
12967 char*
12968 aarch64_output_simd_mov_immediate (rtx const_vector,
12969 machine_mode mode,
12970 unsigned width)
12972 bool is_valid;
12973 static char templ[40];
12974 const char *mnemonic;
12975 const char *shift_op;
12976 unsigned int lane_count = 0;
12977 char element_char;
12979 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12981 /* This will return true to show const_vector is legal for use as an
12982 AdvSIMD MOVI (or, implicitly, MVNI) immediate. It will
12983 also update INFO to show how the immediate should be generated. */
12984 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12985 gcc_assert (is_valid);
12987 element_char = sizetochar (info.element_width);
12988 lane_count = width / info.element_width;
12990 mode = GET_MODE_INNER (mode);
12991 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12993 gcc_assert (info.shift == 0 && ! info.mvn);
12994 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12995 move immediate path. */
12996 if (aarch64_float_const_zero_rtx_p (info.value))
12997 info.value = GEN_INT (0);
12998 else
13000 const unsigned int buf_size = 20;
13001 char float_buf[buf_size] = {'\0'};
13002 real_to_decimal_for_mode (float_buf,
13003 CONST_DOUBLE_REAL_VALUE (info.value),
13004 buf_size, buf_size, 1, mode);
13006 if (lane_count == 1)
13007 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13008 else
13009 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13010 lane_count, element_char, float_buf);
13011 return templ;
13015 mnemonic = info.mvn ? "mvni" : "movi";
13016 shift_op = info.msl ? "msl" : "lsl";
13018 gcc_assert (CONST_INT_P (info.value));
13019 if (lane_count == 1)
13020 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13021 mnemonic, UINTVAL (info.value));
13022 else if (info.shift)
13023 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13024 ", %s %d", mnemonic, lane_count, element_char,
13025 UINTVAL (info.value), shift_op, info.shift);
13026 else
13027 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13028 mnemonic, lane_count, element_char, UINTVAL (info.value));
13029 return templ;
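/* The returned template is an assembler string such as
   "movi\t%0.4s, 0x1, lsl 8" or "mvni\t%0.8h, 0xff"; for floating-point
   splats it is an "fmov" with the constant printed in decimal.  The %0 (or
   %d0) operand is substituted with the destination register later.  */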
13032 char*
13033 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13036 /* If a floating point number was passed and we desire to use it in an
13037 integer mode, do the conversion to integer.
13038 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13040 unsigned HOST_WIDE_INT ival;
13041 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13042 gcc_unreachable ();
13043 immediate = gen_int_mode (ival, mode);
13046 machine_mode vmode;
13047 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
13048 use a 128-bit vector mode. */
13049 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13051 gcc_assert (!VECTOR_MODE_P (mode));
13052 vmode = aarch64_simd_container_mode (mode, width);
13053 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13054 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13057 /* Split operands into moves from op[1] + op[2] into op[0]. */
13059 void
13060 aarch64_split_combinev16qi (rtx operands[3])
13062 unsigned int dest = REGNO (operands[0]);
13063 unsigned int src1 = REGNO (operands[1]);
13064 unsigned int src2 = REGNO (operands[2]);
13065 machine_mode halfmode = GET_MODE (operands[1]);
13066 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13067 rtx destlo, desthi;
13069 gcc_assert (halfmode == V16QImode);
13071 if (src1 == dest && src2 == dest + halfregs)
13073 /* No-op move. Can't split to nothing; emit something. */
13074 emit_note (NOTE_INSN_DELETED);
13075 return;
13078 /* Preserve register attributes for variable tracking. */
13079 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13080 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13081 GET_MODE_SIZE (halfmode));
13083 /* Special case of reversed high/low parts. */
13084 if (reg_overlap_mentioned_p (operands[2], destlo)
13085 && reg_overlap_mentioned_p (operands[1], desthi))
13087 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13088 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13089 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13091 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13093 /* Try to avoid unnecessary moves if part of the result
13094 is in the right place already. */
13095 if (src1 != dest)
13096 emit_move_insn (destlo, operands[1]);
13097 if (src2 != dest + halfregs)
13098 emit_move_insn (desthi, operands[2]);
13100 else
13102 if (src2 != dest + halfregs)
13103 emit_move_insn (desthi, operands[2]);
13104 if (src1 != dest)
13105 emit_move_insn (destlo, operands[1]);
13109 /* vec_perm support. */
13111 #define MAX_VECT_LEN 16
13113 struct expand_vec_perm_d
13115 rtx target, op0, op1;
13116 unsigned char perm[MAX_VECT_LEN];
13117 machine_mode vmode;
13118 unsigned char nelt;
13119 bool one_vector_p;
13120 bool testing_p;
13123 /* Generate a variable permutation. */
13125 static void
13126 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13128 machine_mode vmode = GET_MODE (target);
13129 bool one_vector_p = rtx_equal_p (op0, op1);
13131 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13132 gcc_checking_assert (GET_MODE (op0) == vmode);
13133 gcc_checking_assert (GET_MODE (op1) == vmode);
13134 gcc_checking_assert (GET_MODE (sel) == vmode);
13135 gcc_checking_assert (TARGET_SIMD);
13137 if (one_vector_p)
13139 if (vmode == V8QImode)
13141 /* Expand the argument to a V16QI mode by duplicating it. */
13142 rtx pair = gen_reg_rtx (V16QImode);
13143 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13144 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13146 else
13148 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13151 else
13153 rtx pair;
13155 if (vmode == V8QImode)
13157 pair = gen_reg_rtx (V16QImode);
13158 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13159 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13161 else
13163 pair = gen_reg_rtx (OImode);
13164 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13165 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13170 void
13171 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13173 machine_mode vmode = GET_MODE (target);
13174 unsigned int nelt = GET_MODE_NUNITS (vmode);
13175 bool one_vector_p = rtx_equal_p (op0, op1);
13176 rtx mask;
13178 /* The TBL instruction does not use a modulo index, so we must take care
13179 of that ourselves. */
13180 mask = aarch64_simd_gen_const_vector_dup (vmode,
13181 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13182 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13184 /* For big-endian, we also need to reverse the index within the vector
13185 (but not which vector). */
13186 if (BYTES_BIG_ENDIAN)
13188 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13189 if (!one_vector_p)
13190 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13191 sel = expand_simple_binop (vmode, XOR, sel, mask,
13192 NULL, 0, OPTAB_LIB_WIDEN);
13194 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
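/* Example: for two V16QImode inputs the mask is a vector of 31s, so the AND
   above reduces each selector byte modulo 32 (TBL itself yields 0 for
   out-of-range indices rather than wrapping).  On big-endian the extra XOR
   with 15 reverses the index within each vector while leaving the
   which-vector bit alone.  */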
13197 /* Recognize patterns suitable for the TRN instructions. */
13198 static bool
13199 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13201 unsigned int i, odd, mask, nelt = d->nelt;
13202 rtx out, in0, in1, x;
13203 rtx (*gen) (rtx, rtx, rtx);
13204 machine_mode vmode = d->vmode;
13206 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13207 return false;
13209 /* Note that these are little-endian tests.
13210 We correct for big-endian later. */
13211 if (d->perm[0] == 0)
13212 odd = 0;
13213 else if (d->perm[0] == 1)
13214 odd = 1;
13215 else
13216 return false;
13217 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13219 for (i = 0; i < nelt; i += 2)
13221 if (d->perm[i] != i + odd)
13222 return false;
13223 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13224 return false;
13227 /* Success! */
13228 if (d->testing_p)
13229 return true;
13231 in0 = d->op0;
13232 in1 = d->op1;
13233 if (BYTES_BIG_ENDIAN)
13235 x = in0, in0 = in1, in1 = x;
13236 odd = !odd;
13238 out = d->target;
13240 if (odd)
13242 switch (vmode)
13244 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
13245 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
13246 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
13247 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
13248 case V4SImode: gen = gen_aarch64_trn2v4si; break;
13249 case V2SImode: gen = gen_aarch64_trn2v2si; break;
13250 case V2DImode: gen = gen_aarch64_trn2v2di; break;
13251 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13252 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13253 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13254 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13255 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
13256 default:
13257 return false;
13260 else
13262 switch (vmode)
13264 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
13265 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
13266 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
13267 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
13268 case V4SImode: gen = gen_aarch64_trn1v4si; break;
13269 case V2SImode: gen = gen_aarch64_trn1v2si; break;
13270 case V2DImode: gen = gen_aarch64_trn1v2di; break;
13271 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13272 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13273 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13274 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13275 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
13276 default:
13277 return false;
13281 emit_insn (gen (out, in0, in1));
13282 return true;
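/* For example (little-endian), a V4SImode permutation of {0, 4, 2, 6} over
   two inputs matches TRN1 and {1, 5, 3, 7} matches TRN2.  */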
13285 /* Recognize patterns suitable for the UZP instructions. */
13286 static bool
13287 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13289 unsigned int i, odd, mask, nelt = d->nelt;
13290 rtx out, in0, in1, x;
13291 rtx (*gen) (rtx, rtx, rtx);
13292 machine_mode vmode = d->vmode;
13294 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13295 return false;
13297 /* Note that these are little-endian tests.
13298 We correct for big-endian later. */
13299 if (d->perm[0] == 0)
13300 odd = 0;
13301 else if (d->perm[0] == 1)
13302 odd = 1;
13303 else
13304 return false;
13305 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13307 for (i = 0; i < nelt; i++)
13309 unsigned elt = (i * 2 + odd) & mask;
13310 if (d->perm[i] != elt)
13311 return false;
13314 /* Success! */
13315 if (d->testing_p)
13316 return true;
13318 in0 = d->op0;
13319 in1 = d->op1;
13320 if (BYTES_BIG_ENDIAN)
13322 x = in0, in0 = in1, in1 = x;
13323 odd = !odd;
13325 out = d->target;
13327 if (odd)
13329 switch (vmode)
13331 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13332 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13333 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13334 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13335 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
13336 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
13337 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
13338 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13339 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13340 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13341 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13342 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13343 default:
13344 return false;
13347 else
13349 switch (vmode)
13351 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13352 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13353 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13354 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13355 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
13356 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
13357 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
13358 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13359 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13360 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13361 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13362 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13363 default:
13364 return false;
13368 emit_insn (gen (out, in0, in1));
13369 return true;
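/* For example (little-endian), a V4SImode permutation of {0, 2, 4, 6} over
   two inputs matches UZP1 and {1, 3, 5, 7} matches UZP2.  */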
13372 /* Recognize patterns suitable for the ZIP instructions. */
13373 static bool
13374 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13376 unsigned int i, high, mask, nelt = d->nelt;
13377 rtx out, in0, in1, x;
13378 rtx (*gen) (rtx, rtx, rtx);
13379 machine_mode vmode = d->vmode;
13381 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13382 return false;
13384 /* Note that these are little-endian tests.
13385 We correct for big-endian later. */
13386 high = nelt / 2;
13387 if (d->perm[0] == high)
13388 /* Do Nothing. */
13390 else if (d->perm[0] == 0)
13391 high = 0;
13392 else
13393 return false;
13394 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13396 for (i = 0; i < nelt / 2; i++)
13398 unsigned elt = (i + high) & mask;
13399 if (d->perm[i * 2] != elt)
13400 return false;
13401 elt = (elt + nelt) & mask;
13402 if (d->perm[i * 2 + 1] != elt)
13403 return false;
13406 /* Success! */
13407 if (d->testing_p)
13408 return true;
13410 in0 = d->op0;
13411 in1 = d->op1;
13412 if (BYTES_BIG_ENDIAN)
13414 x = in0, in0 = in1, in1 = x;
13415 high = !high;
13417 out = d->target;
13419 if (high)
13421 switch (vmode)
13423 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13424 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13425 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13426 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13427 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13428 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13429 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13430 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13431 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13432 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13433 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13434 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13435 default:
13436 return false;
13439 else
13441 switch (vmode)
13443 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13444 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13445 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13446 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13447 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13448 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13449 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13450 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13451 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13452 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13453 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13454 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13455 default:
13456 return false;
13460 emit_insn (gen (out, in0, in1));
13461 return true;
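/* For example (little-endian), a V4SImode permutation of {0, 4, 1, 5}
   matches ZIP1 and {2, 6, 3, 7} matches ZIP2.  */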
13464 /* Recognize patterns for the EXT insn. */
13466 static bool
13467 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13469 unsigned int i, nelt = d->nelt;
13470 rtx (*gen) (rtx, rtx, rtx, rtx);
13471 rtx offset;
13473 unsigned int location = d->perm[0]; /* Always < nelt. */
13475 /* Check if the extracted indices are increasing by one. */
13476 for (i = 1; i < nelt; i++)
13478 unsigned int required = location + i;
13479 if (d->one_vector_p)
13481 /* We'll pass the same vector in twice, so allow indices to wrap. */
13482 required &= (nelt - 1);
13484 if (d->perm[i] != required)
13485 return false;
13488 switch (d->vmode)
13490 case V16QImode: gen = gen_aarch64_extv16qi; break;
13491 case V8QImode: gen = gen_aarch64_extv8qi; break;
13492 case V4HImode: gen = gen_aarch64_extv4hi; break;
13493 case V8HImode: gen = gen_aarch64_extv8hi; break;
13494 case V2SImode: gen = gen_aarch64_extv2si; break;
13495 case V4SImode: gen = gen_aarch64_extv4si; break;
13496 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13497 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13498 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13499 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13500 case V2DImode: gen = gen_aarch64_extv2di; break;
13501 case V2DFmode: gen = gen_aarch64_extv2df; break;
13502 default:
13503 return false;
13506 /* Success! */
13507 if (d->testing_p)
13508 return true;
13510 /* The case where (location == 0) is a no-op for both big- and little-endian,
13511 and is removed by the mid-end at optimization levels -O1 and higher. */
13513 if (BYTES_BIG_ENDIAN && (location != 0))
13515 /* After setup, we want the high elements of the first vector (stored
13516 at the LSB end of the register), and the low elements of the second
13517 vector (stored at the MSB end of the register). So swap. */
13518 std::swap (d->op0, d->op1);
13519 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13520 location = nelt - location;
13523 offset = GEN_INT (location);
13524 emit_insn (gen (d->target, d->op0, d->op1, offset));
13525 return true;
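/* For example, a V4SImode permutation of {1, 2, 3, 4} over two inputs takes
   a window starting at element 1 of the first vector and continuing into the
   second, i.e. a single EXT with an offset of one element.  */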
13528 /* Recognize patterns for the REV insns. */
13530 static bool
13531 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13533 unsigned int i, j, diff, nelt = d->nelt;
13534 rtx (*gen) (rtx, rtx);
13536 if (!d->one_vector_p)
13537 return false;
13539 diff = d->perm[0];
13540 switch (diff)
13542 case 7:
13543 switch (d->vmode)
13545 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13546 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13547 default:
13548 return false;
13550 break;
13551 case 3:
13552 switch (d->vmode)
13554 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13555 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13556 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13557 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13558 default:
13559 return false;
13561 break;
13562 case 1:
13563 switch (d->vmode)
13565 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13566 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13567 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13568 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13569 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13570 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13571 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13572 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13573 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13574 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13575 default:
13576 return false;
13578 break;
13579 default:
13580 return false;
13583 for (i = 0; i < nelt ; i += diff + 1)
13584 for (j = 0; j <= diff; j += 1)
13586 /* This is guaranteed to be true as the value of diff
13587 is 7, 3 or 1, and we should have enough elements in the
13588 queue to generate this. Getting a vector mask with a
13589 value of diff other than these values implies that
13590 something is wrong by the time we get here. */
13591 gcc_assert (i + j < nelt);
13592 if (d->perm[i + j] != i + diff - j)
13593 return false;
13596 /* Success! */
13597 if (d->testing_p)
13598 return true;
13600 emit_insn (gen (d->target, d->op0));
13601 return true;
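/* For example, diff == 1 with V4SImode means the permutation {1, 0, 3, 2},
   i.e. adjacent 32-bit elements swapped, which is REV64 on .4s; diff == 7
   only arises for byte vectors and maps to REV64 on .16b/.8b.  */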
13604 static bool
13605 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13607 rtx (*gen) (rtx, rtx, rtx);
13608 rtx out = d->target;
13609 rtx in0;
13610 machine_mode vmode = d->vmode;
13611 unsigned int i, elt, nelt = d->nelt;
13612 rtx lane;
13614 elt = d->perm[0];
13615 for (i = 1; i < nelt; i++)
13617 if (elt != d->perm[i])
13618 return false;
13621 /* The generic preparation in aarch64_expand_vec_perm_const_1
13622 swaps the operand order and the permute indices if it finds
13623 d->perm[0] to be in the second operand. Thus, we can always
13624 use d->op0 and need not do any extra arithmetic to get the
13625 correct lane number. */
13626 in0 = d->op0;
13627 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13629 switch (vmode)
13631 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13632 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13633 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13634 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13635 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13636 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13637 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13638 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13639 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13640 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13641 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13642 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13643 default:
13644 return false;
13647 emit_insn (gen (out, in0, lane));
13648 return true;
13651 static bool
13652 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13654 rtx rperm[MAX_VECT_LEN], sel;
13655 machine_mode vmode = d->vmode;
13656 unsigned int i, nelt = d->nelt;
13658 if (d->testing_p)
13659 return true;
13661 /* Generic code will try constant permutation twice. Once with the
13662 original mode and again with the elements lowered to QImode.
13663 So wait and don't do the selector expansion ourselves. */
13664 if (vmode != V8QImode && vmode != V16QImode)
13665 return false;
13667 for (i = 0; i < nelt; ++i)
13669 int nunits = GET_MODE_NUNITS (vmode);
13671 /* If big-endian and two vectors we end up with a weird mixed-endian
13672 mode on NEON. Reverse the index within each word but not the word
13673 itself. */
13674 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13675 : d->perm[i]);
13677 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13678 sel = force_reg (vmode, sel);
13680 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13681 return true;
13684 static bool
13685 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13687 /* The pattern matching functions above are written to look for a small
13688 number to begin the sequence (0, 1, N/2). If we begin with an index
13689 from the second operand, we can swap the operands. */
13690 if (d->perm[0] >= d->nelt)
13692 unsigned i, nelt = d->nelt;
13694 gcc_assert (nelt == (nelt & -nelt));
13695 for (i = 0; i < nelt; ++i)
13696 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13698 std::swap (d->op0, d->op1);
13701 if (TARGET_SIMD)
13703 if (aarch64_evpc_rev (d))
13704 return true;
13705 else if (aarch64_evpc_ext (d))
13706 return true;
13707 else if (aarch64_evpc_dup (d))
13708 return true;
13709 else if (aarch64_evpc_zip (d))
13710 return true;
13711 else if (aarch64_evpc_uzp (d))
13712 return true;
13713 else if (aarch64_evpc_trn (d))
13714 return true;
13715 return aarch64_evpc_tbl (d);
13717 return false;
13720 /* Expand a vec_perm_const pattern. */
13722 bool
13723 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13725 struct expand_vec_perm_d d;
13726 int i, nelt, which;
13728 d.target = target;
13729 d.op0 = op0;
13730 d.op1 = op1;
13732 d.vmode = GET_MODE (target);
13733 gcc_assert (VECTOR_MODE_P (d.vmode));
13734 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13735 d.testing_p = false;
13737 for (i = which = 0; i < nelt; ++i)
13739 rtx e = XVECEXP (sel, 0, i);
13740 int ei = INTVAL (e) & (2 * nelt - 1);
13741 which |= (ei < nelt ? 1 : 2);
13742 d.perm[i] = ei;
13745 switch (which)
13747 default:
13748 gcc_unreachable ();
13750 case 3:
13751 d.one_vector_p = false;
13752 if (!rtx_equal_p (op0, op1))
13753 break;
13755 /* The elements of PERM do not suggest that only the first operand
13756 is used, but both operands are identical. Allow easier matching
13757 of the permutation by folding the permutation into the single
13758 input vector. */
13759 /* Fall Through. */
13760 case 2:
13761 for (i = 0; i < nelt; ++i)
13762 d.perm[i] &= nelt - 1;
13763 d.op0 = op1;
13764 d.one_vector_p = true;
13765 break;
13767 case 1:
13768 d.op1 = op0;
13769 d.one_vector_p = true;
13770 break;
13773 return aarch64_expand_vec_perm_const_1 (&d);
13776 static bool
13777 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13778 const unsigned char *sel)
13780 struct expand_vec_perm_d d;
13781 unsigned int i, nelt, which;
13782 bool ret;
13784 d.vmode = vmode;
13785 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13786 d.testing_p = true;
13787 memcpy (d.perm, sel, nelt);
13789 /* Calculate whether all elements are in one vector. */
13790 for (i = which = 0; i < nelt; ++i)
13792 unsigned char e = d.perm[i];
13793 gcc_assert (e < 2 * nelt);
13794 which |= (e < nelt ? 1 : 2);
13797 /* If all elements are from the second vector, reindex as if from the
13798 first vector. */
13799 if (which == 2)
13800 for (i = 0; i < nelt; ++i)
13801 d.perm[i] -= nelt;
13803 /* Check whether the mask can be applied to a single vector. */
13804 d.one_vector_p = (which != 3);
13806 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13807 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13808 if (!d.one_vector_p)
13809 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13811 start_sequence ();
13812 ret = aarch64_expand_vec_perm_const_1 (&d);
13813 end_sequence ();
13815 return ret;
13819 aarch64_reverse_mask (machine_mode mode)
13821 /* We have to reverse each vector because we don't have
13822 a permuted load that can reverse-load according to ABI rules. */
13823 rtx mask;
13824 rtvec v = rtvec_alloc (16);
13825 int i, j;
13826 int nunits = GET_MODE_NUNITS (mode);
13827 int usize = GET_MODE_UNIT_SIZE (mode);
13829 gcc_assert (BYTES_BIG_ENDIAN);
13830 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13832 for (i = 0; i < nunits; i++)
13833 for (j = 0; j < usize; j++)
13834 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13835 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13836 return force_reg (V16QImode, mask);
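/* A minimal standalone sketch, for illustration only, of the byte
   selector built above: each element's bytes are listed in reverse
   order, so for a V4SI vector (4 elements of 4 bytes) the mask is
   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12.  */
#include <stdio.h>

int
main (void)
{
  const int nunits = 4, usize = 4;	/* V4SImode: 4 x 32-bit lanes.  */
  for (int i = 0; i < nunits; i++)
    for (int j = 0; j < usize; j++)
      printf ("%d ", (i + 1) * usize - 1 - j);
  printf ("\n");
  return 0;
}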
13839 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13840 However, due to issues with register allocation it is preferable to avoid
13841 tying integer scalar and FP scalar modes. Executing integer operations
13842 in general registers is better than treating them as scalar vector
13843 operations. This reduces latency and avoids redundant int<->FP moves.
13844 So tie modes if they are either the same class, or vector modes with
13845 other vector modes, vector structs or any scalar mode.
13848 bool
13849 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13851 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13852 return true;
13854 /* We specifically want to allow elements of "structure" modes to
13855 be tieable to the structure. This more general condition allows
13856 other rarer situations too. */
13857 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13858 return true;
13860 /* Also allow any scalar modes with vectors. */
13861 if (aarch64_vector_mode_supported_p (mode1)
13862 || aarch64_vector_mode_supported_p (mode2))
13863 return true;
13865 return false;
13868 /* Return a new RTX holding the result of moving POINTER forward by
13869 AMOUNT bytes. */
13871 static rtx
13872 aarch64_move_pointer (rtx pointer, int amount)
13874 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13876 return adjust_automodify_address (pointer, GET_MODE (pointer),
13877 next, amount);
13880 /* Return a new RTX holding the result of moving POINTER forward by the
13881 size of the mode it points to. */
13883 static rtx
13884 aarch64_progress_pointer (rtx pointer)
13886 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13888 return aarch64_move_pointer (pointer, amount);
13891 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13892 MODE bytes. */
13894 static void
13895 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13896 machine_mode mode)
13898 rtx reg = gen_reg_rtx (mode);
13900 /* "Cast" the pointers to the correct mode. */
13901 *src = adjust_address (*src, mode, 0);
13902 *dst = adjust_address (*dst, mode, 0);
13903 /* Emit the memcpy. */
13904 emit_move_insn (reg, *src);
13905 emit_move_insn (*dst, reg);
13906 /* Move the pointers forward. */
13907 *src = aarch64_progress_pointer (*src);
13908 *dst = aarch64_progress_pointer (*dst);
13911 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13912 we succeed, otherwise return false. */
13914 bool
13915 aarch64_expand_movmem (rtx *operands)
13917 unsigned int n;
13918 rtx dst = operands[0];
13919 rtx src = operands[1];
13920 rtx base;
13921 bool speed_p = !optimize_function_for_size_p (cfun);
13923 /* When optimizing for size, give a better estimate of the length of a
13924 memcpy call, but use the default otherwise. */
13925 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13927 /* We can't do anything smart if the amount to copy is not constant. */
13928 if (!CONST_INT_P (operands[2]))
13929 return false;
13931 n = UINTVAL (operands[2]);
13933 /* Try to keep the number of instructions low. For cases below 16 bytes we
13934 need to make at most two moves. For cases above 16 bytes it will be one
13935 move for each 16 byte chunk, then at most two additional moves. */
13936 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13937 return false;
13939 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13940 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13942 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13943 src = adjust_automodify_address (src, VOIDmode, base, 0);
13945 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13946 1-byte chunk. */
13947 if (n < 4)
13949 if (n >= 2)
13951 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13952 n -= 2;
13955 if (n == 1)
13956 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13958 return true;
13961 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13962 4-byte chunk, partially overlapping with the previously copied chunk. */
13963 if (n < 8)
13965 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13966 n -= 4;
13967 if (n > 0)
13969 int move = n - 4;
13971 src = aarch64_move_pointer (src, move);
13972 dst = aarch64_move_pointer (dst, move);
13973 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13975 return true;
13978 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13979 them, then (if applicable) an 8-byte chunk. */
13980 while (n >= 8)
13982 if (n / 16)
13984 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13985 n -= 16;
13987 else
13989 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13990 n -= 8;
13994 /* Finish the final bytes of the copy. We can always do this in one
13995 instruction. We either copy the exact amount we need, or partially
13996 overlap with the previous chunk we copied and copy 8 bytes. */
13997 if (n == 0)
13998 return true;
13999 else if (n == 1)
14000 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14001 else if (n == 2)
14002 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14003 else if (n == 4)
14004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14005 else
14007 if (n == 3)
14009 src = aarch64_move_pointer (src, -1);
14010 dst = aarch64_move_pointer (dst, -1);
14011 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14013 else
14015 int move = n - 8;
14017 src = aarch64_move_pointer (src, move);
14018 dst = aarch64_move_pointer (dst, move);
14019 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14023 return true;
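/* A minimal standalone sketch, for illustration only, of the size check
   above: one 16-byte (TImode) move per full chunk, plus at most two
   extra moves for a non-zero tail, compared against the instruction
   budget.  AARCH64_CALL_RATIO below is a stand-in value, not the real
   tuning macro.  */
#include <stdbool.h>
#include <stdio.h>

#define AARCH64_CALL_RATIO 8	/* Stand-in value for illustration.  */

static bool
movmem_worth_inlining (unsigned int n, bool speed_p)
{
  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
  return (n / 16) + (n % 16 ? 2 : 0) <= max_instructions;
}

int
main (void)
{
  printf ("%d %d\n", movmem_worth_inlining (64, true),
	  movmem_worth_inlining (256, true));
  return 0;
}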
14026 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14027 SImode stores. Handle the case when the constant has identical
14028 bottom and top halves. This is beneficial when the two stores can be
14029 merged into an STP and we avoid synthesising potentially expensive
14030 immediates twice. Return true if such a split is possible. */
14032 bool
14033 aarch64_split_dimode_const_store (rtx dst, rtx src)
14035 rtx lo = gen_lowpart (SImode, src);
14036 rtx hi = gen_highpart_mode (SImode, DImode, src);
14038 bool size_p = optimize_function_for_size_p (cfun);
14040 if (!rtx_equal_p (lo, hi))
14041 return false;
14043 unsigned int orig_cost
14044 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14045 unsigned int lo_cost
14046 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14048 /* We want to transform:
14049 MOV x1, 49370
14050 MOVK x1, 0x140, lsl 16
14051 MOVK x1, 0xc0da, lsl 32
14052 MOVK x1, 0x140, lsl 48
14053 STR x1, [x0]
14054 into:
14055 MOV w1, 49370
14056 MOVK w1, 0x140, lsl 16
14057 STP w1, w1, [x0]
14058 So we want to perform this only when we save two instructions
14059 or more. When optimizing for size, however, accept any code size
14060 savings we can. */
14061 if (size_p && orig_cost <= lo_cost)
14062 return false;
14064 if (!size_p
14065 && (orig_cost <= lo_cost + 1))
14066 return false;
14068 rtx mem_lo = adjust_address (dst, SImode, 0);
14069 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14070 return false;
14072 rtx tmp_reg = gen_reg_rtx (SImode);
14073 aarch64_expand_mov_immediate (tmp_reg, lo);
14074 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14075 /* Don't emit an explicit store pair as this may not be always profitable.
14076 Let the sched-fusion logic decide whether to merge them. */
14077 emit_move_insn (mem_lo, tmp_reg);
14078 emit_move_insn (mem_hi, tmp_reg);
14080 return true;
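/* A minimal standalone sketch, for illustration only, of the first test
   above: the split is only considered when the 64-bit constant has
   identical low and high 32-bit halves, as in the 0x0140c0da0140c0da
   example from the comment.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
has_identical_halves (uint64_t x)
{
  return (uint32_t) x == (uint32_t) (x >> 32);
}

int
main (void)
{
  printf ("%d %d\n", has_identical_halves (UINT64_C (0x0140c0da0140c0da)),
	  has_identical_halves (UINT64_C (0x0000000100000002)));
  return 0;
}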
14083 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14085 static unsigned HOST_WIDE_INT
14086 aarch64_asan_shadow_offset (void)
14088 return (HOST_WIDE_INT_1 << 36);
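/* A minimal standalone sketch, for illustration only, of how the offset
   above is used: AddressSanitizer's usual mapping places the shadow
   byte for ADDR at (ADDR >> 3) + shadow_offset, which is 1 << 36 on
   this target.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t
asan_shadow_address (uint64_t addr)
{
  return (addr >> 3) + (UINT64_C (1) << 36);
}

int
main (void)
{
  printf ("%#llx\n",
	  (unsigned long long) asan_shadow_address (UINT64_C (0x7f0000001000)));
  return 0;
}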
14091 static bool
14092 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14093 unsigned int align,
14094 enum by_pieces_operation op,
14095 bool speed_p)
14097 /* STORE_BY_PIECES can be used when copying a constant string, but
14098 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14099 For now we always fail this and let the move_by_pieces code copy
14100 the string from read-only memory. */
14101 if (op == STORE_BY_PIECES)
14102 return false;
14104 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14107 static rtx
14108 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14109 int code, tree treeop0, tree treeop1)
14111 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14112 rtx op0, op1;
14113 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14114 insn_code icode;
14115 struct expand_operand ops[4];
14117 start_sequence ();
14118 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14120 op_mode = GET_MODE (op0);
14121 if (op_mode == VOIDmode)
14122 op_mode = GET_MODE (op1);
14124 switch (op_mode)
14126 case QImode:
14127 case HImode:
14128 case SImode:
14129 cmp_mode = SImode;
14130 icode = CODE_FOR_cmpsi;
14131 break;
14133 case DImode:
14134 cmp_mode = DImode;
14135 icode = CODE_FOR_cmpdi;
14136 break;
14138 case SFmode:
14139 cmp_mode = SFmode;
14140 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14141 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14142 break;
14144 case DFmode:
14145 cmp_mode = DFmode;
14146 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14147 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14148 break;
14150 default:
14151 end_sequence ();
14152 return NULL_RTX;
14155 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14156 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14157 if (!op0 || !op1)
14159 end_sequence ();
14160 return NULL_RTX;
14162 *prep_seq = get_insns ();
14163 end_sequence ();
14165 create_fixed_operand (&ops[0], op0);
14166 create_fixed_operand (&ops[1], op1);
14168 start_sequence ();
14169 if (!maybe_expand_insn (icode, 2, ops))
14171 end_sequence ();
14172 return NULL_RTX;
14174 *gen_seq = get_insns ();
14175 end_sequence ();
14177 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14178 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14181 static rtx
14182 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14183 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14185 rtx op0, op1, target;
14186 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14187 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14188 insn_code icode;
14189 struct expand_operand ops[6];
14190 int aarch64_cond;
14192 push_to_sequence (*prep_seq);
14193 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14195 op_mode = GET_MODE (op0);
14196 if (op_mode == VOIDmode)
14197 op_mode = GET_MODE (op1);
14199 switch (op_mode)
14201 case QImode:
14202 case HImode:
14203 case SImode:
14204 cmp_mode = SImode;
14205 icode = CODE_FOR_ccmpsi;
14206 break;
14208 case DImode:
14209 cmp_mode = DImode;
14210 icode = CODE_FOR_ccmpdi;
14211 break;
14213 case SFmode:
14214 cmp_mode = SFmode;
14215 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14216 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14217 break;
14219 case DFmode:
14220 cmp_mode = DFmode;
14221 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14222 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14223 break;
14225 default:
14226 end_sequence ();
14227 return NULL_RTX;
14230 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14231 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14232 if (!op0 || !op1)
14234 end_sequence ();
14235 return NULL_RTX;
14237 *prep_seq = get_insns ();
14238 end_sequence ();
14240 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14241 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14243 if (bit_code != AND)
14245 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14246 GET_MODE (XEXP (prev, 0))),
14247 VOIDmode, XEXP (prev, 0), const0_rtx);
14248 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14251 create_fixed_operand (&ops[0], XEXP (prev, 0));
14252 create_fixed_operand (&ops[1], target);
14253 create_fixed_operand (&ops[2], op0);
14254 create_fixed_operand (&ops[3], op1);
14255 create_fixed_operand (&ops[4], prev);
14256 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14258 push_to_sequence (*gen_seq);
14259 if (!maybe_expand_insn (icode, 6, ops))
14261 end_sequence ();
14262 return NULL_RTX;
14265 *gen_seq = get_insns ();
14266 end_sequence ();
14268 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
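/* A minimal standalone sketch, for illustration only, of the kind of
   source the two ccmp hooks above target: a chain of comparisons joined
   by && or || can typically be compiled to CMP followed by CCMP and a
   single CSET instead of separate compare-and-branch sequences.  */
#include <stdio.h>

static int
both_conditions_hold (int a, int b, int c, int d)
{
  return a < b && c == d;
}

int
main (void)
{
  printf ("%d\n", both_conditions_hold (1, 2, 3, 3));
  return 0;
}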
14271 #undef TARGET_GEN_CCMP_FIRST
14272 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14274 #undef TARGET_GEN_CCMP_NEXT
14275 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14277 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
14278 instruction fusion of some sort. */
14280 static bool
14281 aarch64_macro_fusion_p (void)
14283 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14287 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14288 should be kept together during scheduling. */
14290 static bool
14291 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14293 rtx set_dest;
14294 rtx prev_set = single_set (prev);
14295 rtx curr_set = single_set (curr);
14296 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14297 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14299 if (!aarch64_macro_fusion_p ())
14300 return false;
14302 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14304 /* We are trying to match:
14305 prev (mov) == (set (reg r0) (const_int imm16))
14306 curr (movk) == (set (zero_extract (reg r0)
14307 (const_int 16)
14308 (const_int 16))
14309 (const_int imm16_1)) */
14311 set_dest = SET_DEST (curr_set);
14313 if (GET_CODE (set_dest) == ZERO_EXTRACT
14314 && CONST_INT_P (SET_SRC (curr_set))
14315 && CONST_INT_P (SET_SRC (prev_set))
14316 && CONST_INT_P (XEXP (set_dest, 2))
14317 && INTVAL (XEXP (set_dest, 2)) == 16
14318 && REG_P (XEXP (set_dest, 0))
14319 && REG_P (SET_DEST (prev_set))
14320 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14322 return true;
14326 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14329 /* We're trying to match:
14330 prev (adrp) == (set (reg r1)
14331 (high (symbol_ref ("SYM"))))
14332 curr (add) == (set (reg r0)
14333 (lo_sum (reg r1)
14334 (symbol_ref ("SYM"))))
14335 Note that r0 need not necessarily be the same as r1, especially
14336 during pre-regalloc scheduling. */
14338 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14339 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14341 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14342 && REG_P (XEXP (SET_SRC (curr_set), 0))
14343 && REGNO (XEXP (SET_SRC (curr_set), 0))
14344 == REGNO (SET_DEST (prev_set))
14345 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14346 XEXP (SET_SRC (curr_set), 1)))
14347 return true;
14351 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14354 /* We're trying to match:
14355 prev (movk) == (set (zero_extract (reg r0)
14356 (const_int 16)
14357 (const_int 32))
14358 (const_int imm16_1))
14359 curr (movk) == (set (zero_extract (reg r0)
14360 (const_int 16)
14361 (const_int 48))
14362 (const_int imm16_2)) */
14364 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14365 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14366 && REG_P (XEXP (SET_DEST (prev_set), 0))
14367 && REG_P (XEXP (SET_DEST (curr_set), 0))
14368 && REGNO (XEXP (SET_DEST (prev_set), 0))
14369 == REGNO (XEXP (SET_DEST (curr_set), 0))
14370 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14371 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14372 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14373 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14374 && CONST_INT_P (SET_SRC (prev_set))
14375 && CONST_INT_P (SET_SRC (curr_set)))
14376 return true;
14379 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14381 /* We're trying to match:
14382 prev (adrp) == (set (reg r0)
14383 (high (symbol_ref ("SYM"))))
14384 curr (ldr) == (set (reg r1)
14385 (mem (lo_sum (reg r0)
14386 (symbol_ref ("SYM")))))
14388 curr (ldr) == (set (reg r1)
14389 (zero_extend (mem
14390 (lo_sum (reg r0)
14391 (symbol_ref ("SYM")))))) */
14392 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14393 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14395 rtx curr_src = SET_SRC (curr_set);
14397 if (GET_CODE (curr_src) == ZERO_EXTEND)
14398 curr_src = XEXP (curr_src, 0);
14400 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14401 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14402 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14403 == REGNO (SET_DEST (prev_set))
14404 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14405 XEXP (SET_SRC (prev_set), 0)))
14406 return true;
14410 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14411 && aarch_crypto_can_dual_issue (prev, curr))
14412 return true;
14414 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14415 && any_condjump_p (curr))
14417 enum attr_type prev_type = get_attr_type (prev);
14419 unsigned int condreg1, condreg2;
14420 rtx cc_reg_1;
14421 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14422 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14424 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14425 && prev
14426 && modified_in_p (cc_reg_1, prev))
14428 /* FIXME: this misses some instructions that ThunderX considers
14429 simple arithmetic. Simple shifts are missed here. */
14430 if (prev_type == TYPE_ALUS_SREG
14431 || prev_type == TYPE_ALUS_IMM
14432 || prev_type == TYPE_LOGICS_REG
14433 || prev_type == TYPE_LOGICS_IMM)
14434 return true;
14438 if (prev_set
14439 && curr_set
14440 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14441 && any_condjump_p (curr))
14443 /* We're trying to match:
14444 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14445 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14446 (const_int 0))
14447 (label_ref ("SYM"))
14448 (pc)) */
14449 if (SET_DEST (curr_set) == (pc_rtx)
14450 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14451 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14452 && REG_P (SET_DEST (prev_set))
14453 && REGNO (SET_DEST (prev_set))
14454 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14456 /* Fuse ALU operations followed by conditional branch instruction. */
14457 switch (get_attr_type (prev))
14459 case TYPE_ALU_IMM:
14460 case TYPE_ALU_SREG:
14461 case TYPE_ADC_REG:
14462 case TYPE_ADC_IMM:
14463 case TYPE_ADCS_REG:
14464 case TYPE_ADCS_IMM:
14465 case TYPE_LOGIC_REG:
14466 case TYPE_LOGIC_IMM:
14467 case TYPE_CSEL:
14468 case TYPE_ADR:
14469 case TYPE_MOV_IMM:
14470 case TYPE_SHIFT_REG:
14471 case TYPE_SHIFT_IMM:
14472 case TYPE_BFM:
14473 case TYPE_RBIT:
14474 case TYPE_REV:
14475 case TYPE_EXTEND:
14476 return true;
14478 default:;
14483 return false;
14486 /* Return true iff the instruction fusion described by OP is enabled. */
14488 bool
14489 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14491 return (aarch64_tune_params.fusible_ops & op) != 0;
14494 /* If MEM is in the form of [base+offset], extract the two parts
14495 of the address into BASE and OFFSET; otherwise return false
14496 after clearing BASE and OFFSET. */
14498 bool
14499 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14501 rtx addr;
14503 gcc_assert (MEM_P (mem));
14505 addr = XEXP (mem, 0);
14507 if (REG_P (addr))
14509 *base = addr;
14510 *offset = const0_rtx;
14511 return true;
14514 if (GET_CODE (addr) == PLUS
14515 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14517 *base = XEXP (addr, 0);
14518 *offset = XEXP (addr, 1);
14519 return true;
14522 *base = NULL_RTX;
14523 *offset = NULL_RTX;
14525 return false;
14528 /* Types for scheduling fusion. */
14529 enum sched_fusion_type
14531 SCHED_FUSION_NONE = 0,
14532 SCHED_FUSION_LD_SIGN_EXTEND,
14533 SCHED_FUSION_LD_ZERO_EXTEND,
14534 SCHED_FUSION_LD,
14535 SCHED_FUSION_ST,
14536 SCHED_FUSION_NUM
14539 /* If INSN is a load or store with an address in the form of [base+offset],
14540 extract the two parts into BASE and OFFSET. Return the scheduling
14541 fusion type of this INSN. */
14543 static enum sched_fusion_type
14544 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14546 rtx x, dest, src;
14547 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14549 gcc_assert (INSN_P (insn));
14550 x = PATTERN (insn);
14551 if (GET_CODE (x) != SET)
14552 return SCHED_FUSION_NONE;
14554 src = SET_SRC (x);
14555 dest = SET_DEST (x);
14557 machine_mode dest_mode = GET_MODE (dest);
14559 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14560 return SCHED_FUSION_NONE;
14562 if (GET_CODE (src) == SIGN_EXTEND)
14564 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14565 src = XEXP (src, 0);
14566 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14567 return SCHED_FUSION_NONE;
14569 else if (GET_CODE (src) == ZERO_EXTEND)
14571 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14572 src = XEXP (src, 0);
14573 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14574 return SCHED_FUSION_NONE;
14577 if (GET_CODE (src) == MEM && REG_P (dest))
14578 extract_base_offset_in_addr (src, base, offset);
14579 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14581 fusion = SCHED_FUSION_ST;
14582 extract_base_offset_in_addr (dest, base, offset);
14584 else
14585 return SCHED_FUSION_NONE;
14587 if (*base == NULL_RTX || *offset == NULL_RTX)
14588 fusion = SCHED_FUSION_NONE;
14590 return fusion;
14593 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14595 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14596 and PRI are only calculated for these instructions. For other instructions,
14597 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
14598 other instruction types can be added by returning different priorities.
14600 It's important that irrelevant instructions get the largest FUSION_PRI. */
14602 static void
14603 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14604 int *fusion_pri, int *pri)
14606 int tmp, off_val;
14607 rtx base, offset;
14608 enum sched_fusion_type fusion;
14610 gcc_assert (INSN_P (insn));
14612 tmp = max_pri - 1;
14613 fusion = fusion_load_store (insn, &base, &offset);
14614 if (fusion == SCHED_FUSION_NONE)
14616 *pri = tmp;
14617 *fusion_pri = tmp;
14618 return;
14621 /* Set FUSION_PRI according to fusion type and base register. */
14622 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14624 /* Calculate PRI. */
14625 tmp /= 2;
14627 /* INSN with smaller offset goes first. */
14628 off_val = (int)(INTVAL (offset));
14629 if (off_val >= 0)
14630 tmp -= (off_val & 0xfffff);
14631 else
14632 tmp += ((- off_val) & 0xfffff);
14634 *pri = tmp;
14635 return;
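/* A minimal standalone sketch, for illustration only, of the priority
   scheme above: loads or stores that share a fusion type and base
   register get the same FUSION_PRI, and within that group a smaller
   offset gets a larger PRI, so candidate pairs end up adjacent in the
   ready list.  FIRST_PSEUDO_REG is a stand-in for FIRST_PSEUDO_REGISTER.  */
#include <stdio.h>

#define FIRST_PSEUDO_REG 100	/* Stand-in value for illustration.  */

static void
fusion_priorities (int max_pri, int fusion_type, int base_regno,
		   long offset, int *fusion_pri, int *pri)
{
  int tmp = max_pri - 1;
  *fusion_pri = tmp - fusion_type * FIRST_PSEUDO_REG - base_regno;
  tmp /= 2;
  if (offset >= 0)
    tmp -= (offset & 0xfffff);
  else
    tmp += ((-offset) & 0xfffff);
  *pri = tmp;
}

int
main (void)
{
  int fp1, p1, fp2, p2;
  fusion_priorities (10000, 3, 1, 8, &fp1, &p1);	/* e.g. ldr x2, [x1, 8]  */
  fusion_priorities (10000, 3, 1, 16, &fp2, &p2);	/* e.g. ldr x3, [x1, 16] */
  printf ("%d %d / %d %d\n", fp1, p1, fp2, p2);
  return 0;
}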
14638 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14639 Adjust priority of sha1h instructions so they are scheduled before
14640 other SHA1 instructions. */
14642 static int
14643 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14645 rtx x = PATTERN (insn);
14647 if (GET_CODE (x) == SET)
14649 x = SET_SRC (x);
14651 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14652 return priority + 10;
14655 return priority;
14658 /* Given OPERANDS of consecutive load/store, check if we can merge
14659 them into ldp/stp. LOAD is true if they are load instructions.
14660 MODE is the mode of memory operands. */
14662 bool
14663 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14664 machine_mode mode)
14666 HOST_WIDE_INT offval_1, offval_2, msize;
14667 enum reg_class rclass_1, rclass_2;
14668 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14670 if (load)
14672 mem_1 = operands[1];
14673 mem_2 = operands[3];
14674 reg_1 = operands[0];
14675 reg_2 = operands[2];
14676 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14677 if (REGNO (reg_1) == REGNO (reg_2))
14678 return false;
14680 else
14682 mem_1 = operands[0];
14683 mem_2 = operands[2];
14684 reg_1 = operands[1];
14685 reg_2 = operands[3];
14688 /* The mems cannot be volatile. */
14689 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14690 return false;
14692 /* If we have SImode and slow unaligned ldp,
14693 check that the alignment is at least 8 bytes. */
14694 if (mode == SImode
14695 && (aarch64_tune_params.extra_tuning_flags
14696 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14697 && !optimize_size
14698 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14699 return false;
14701 /* Check if the addresses are in the form of [base+offset]. */
14702 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14703 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14704 return false;
14705 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14706 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14707 return false;
14709 /* Check if the bases are same. */
14710 if (!rtx_equal_p (base_1, base_2))
14711 return false;
14713 offval_1 = INTVAL (offset_1);
14714 offval_2 = INTVAL (offset_2);
14715 msize = GET_MODE_SIZE (mode);
14716 /* Check if the offsets are consecutive. */
14717 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14718 return false;
14720 /* Check if the addresses are clobbered by load. */
14721 if (load)
14723 if (reg_mentioned_p (reg_1, mem_1))
14724 return false;
14726 /* In increasing order, the last load can clobber the address. */
14727 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14728 return false;
14731 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14732 rclass_1 = FP_REGS;
14733 else
14734 rclass_1 = GENERAL_REGS;
14736 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14737 rclass_2 = FP_REGS;
14738 else
14739 rclass_2 = GENERAL_REGS;
14741 /* Check if the registers are of same class. */
14742 if (rclass_1 != rclass_2)
14743 return false;
14745 return true;
14748 /* Given OPERANDS of consecutive load/store, check if we can merge
14749 them into ldp/stp by adjusting the offset. LOAD is true if they
14750 are load instructions. MODE is the mode of memory operands.
14752 Given the consecutive stores below:
14754 str w1, [xb, 0x100]
14755 str w1, [xb, 0x104]
14756 str w1, [xb, 0x108]
14757 str w1, [xb, 0x10c]
14759 Though the offsets are out of the range supported by stp, we can
14760 still pair them after adjusting the offset, like:
14762 add scratch, xb, 0x100
14763 stp w1, w1, [scratch]
14764 stp w1, w1, [scratch, 0x8]
14766 The peephole patterns detecting this opportunity should guarantee
14767 the scratch register is available. */
14769 bool
14770 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14771 machine_mode mode)
14773 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14774 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14775 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14776 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14778 if (load)
14780 reg_1 = operands[0];
14781 mem_1 = operands[1];
14782 reg_2 = operands[2];
14783 mem_2 = operands[3];
14784 reg_3 = operands[4];
14785 mem_3 = operands[5];
14786 reg_4 = operands[6];
14787 mem_4 = operands[7];
14788 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14789 && REG_P (reg_3) && REG_P (reg_4));
14790 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14791 return false;
14793 else
14795 mem_1 = operands[0];
14796 reg_1 = operands[1];
14797 mem_2 = operands[2];
14798 reg_2 = operands[3];
14799 mem_3 = operands[4];
14800 reg_3 = operands[5];
14801 mem_4 = operands[6];
14802 reg_4 = operands[7];
14804 /* Skip if the memory operand is by itself valid for ldp/stp. */
14805 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14806 return false;
14808 /* The mems cannot be volatile. */
14809 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14810 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14811 return false;
14813 /* Check if the addresses are in the form of [base+offset]. */
14814 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14815 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14816 return false;
14817 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14818 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14819 return false;
14820 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14821 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14822 return false;
14823 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14824 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14825 return false;
14827 /* Check if the bases are same. */
14828 if (!rtx_equal_p (base_1, base_2)
14829 || !rtx_equal_p (base_2, base_3)
14830 || !rtx_equal_p (base_3, base_4))
14831 return false;
14833 offval_1 = INTVAL (offset_1);
14834 offval_2 = INTVAL (offset_2);
14835 offval_3 = INTVAL (offset_3);
14836 offval_4 = INTVAL (offset_4);
14837 msize = GET_MODE_SIZE (mode);
14838 /* Check if the offsets are consecutive. */
14839 if ((offval_1 != (offval_2 + msize)
14840 || offval_1 != (offval_3 + msize * 2)
14841 || offval_1 != (offval_4 + msize * 3))
14842 && (offval_4 != (offval_3 + msize)
14843 || offval_4 != (offval_2 + msize * 2)
14844 || offval_4 != (offval_1 + msize * 3)))
14845 return false;
14847 /* Check if the addresses are clobbered by load. */
14848 if (load)
14850 if (reg_mentioned_p (reg_1, mem_1)
14851 || reg_mentioned_p (reg_2, mem_2)
14852 || reg_mentioned_p (reg_3, mem_3))
14853 return false;
14855 /* In increasing order, the last load can clobber the address. */
14856 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14857 return false;
14860 /* If we have SImode and slow unaligned ldp,
14861 check that the alignment is at least 8 bytes. */
14862 if (mode == SImode
14863 && (aarch64_tune_params.extra_tuning_flags
14864 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14865 && !optimize_size
14866 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14867 return false;
14869 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14870 rclass_1 = FP_REGS;
14871 else
14872 rclass_1 = GENERAL_REGS;
14874 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14875 rclass_2 = FP_REGS;
14876 else
14877 rclass_2 = GENERAL_REGS;
14879 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14880 rclass_3 = FP_REGS;
14881 else
14882 rclass_3 = GENERAL_REGS;
14884 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14885 rclass_4 = FP_REGS;
14886 else
14887 rclass_4 = GENERAL_REGS;
14889 /* Check if the registers are of same class. */
14890 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14891 return false;
14893 return true;
14896 /* Given OPERANDS of consecutive load/store, this function pairs them
14897 into ldp/stp after adjusting the offset. It depends on the fact
14898 that addresses of load/store instructions are in increasing order.
14899 MODE is the mode of memory operands. CODE is the rtl operator
14900 which should be applied to all memory operands, it's SIGN_EXTEND,
14901 ZERO_EXTEND or UNKNOWN. */
14903 bool
14904 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14905 machine_mode mode, RTX_CODE code)
14907 rtx base, offset, t1, t2;
14908 rtx mem_1, mem_2, mem_3, mem_4;
14909 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14911 if (load)
14913 mem_1 = operands[1];
14914 mem_2 = operands[3];
14915 mem_3 = operands[5];
14916 mem_4 = operands[7];
14918 else
14920 mem_1 = operands[0];
14921 mem_2 = operands[2];
14922 mem_3 = operands[4];
14923 mem_4 = operands[6];
14924 gcc_assert (code == UNKNOWN);
14927 extract_base_offset_in_addr (mem_1, &base, &offset);
14928 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14930 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14931 msize = GET_MODE_SIZE (mode);
14932 stp_off_limit = msize * 0x40;
14933 off_val = INTVAL (offset);
14934 abs_off = (off_val < 0) ? -off_val : off_val;
14935 new_off = abs_off % stp_off_limit;
14936 adj_off = abs_off - new_off;
14938 /* Further adjust to make sure all offsets are OK. */
14939 if ((new_off + msize * 2) >= stp_off_limit)
14941 adj_off += stp_off_limit;
14942 new_off -= stp_off_limit;
14945 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14946 if (adj_off >= 0x1000)
14947 return false;
14949 if (off_val < 0)
14951 adj_off = -adj_off;
14952 new_off = -new_off;
14955 /* Create new memory references. */
14956 mem_1 = change_address (mem_1, VOIDmode,
14957 plus_constant (DImode, operands[8], new_off));
14959 /* Check if the adjusted address is OK for ldp/stp. */
14960 if (!aarch64_mem_pair_operand (mem_1, mode))
14961 return false;
14963 msize = GET_MODE_SIZE (mode);
14964 mem_2 = change_address (mem_2, VOIDmode,
14965 plus_constant (DImode,
14966 operands[8],
14967 new_off + msize));
14968 mem_3 = change_address (mem_3, VOIDmode,
14969 plus_constant (DImode,
14970 operands[8],
14971 new_off + msize * 2));
14972 mem_4 = change_address (mem_4, VOIDmode,
14973 plus_constant (DImode,
14974 operands[8],
14975 new_off + msize * 3));
14977 if (code == ZERO_EXTEND)
14979 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14980 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14981 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14982 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14984 else if (code == SIGN_EXTEND)
14986 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14987 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14988 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14989 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14992 if (load)
14994 operands[1] = mem_1;
14995 operands[3] = mem_2;
14996 operands[5] = mem_3;
14997 operands[7] = mem_4;
14999 else
15001 operands[0] = mem_1;
15002 operands[2] = mem_2;
15003 operands[4] = mem_3;
15004 operands[6] = mem_4;
15007 /* Emit adjusting instruction. */
15008 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15009 /* Emit ldp/stp instructions. */
15010 t1 = gen_rtx_SET (operands[0], operands[1]);
15011 t2 = gen_rtx_SET (operands[2], operands[3]);
15012 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15013 t1 = gen_rtx_SET (operands[4], operands[5]);
15014 t2 = gen_rtx_SET (operands[6], operands[7]);
15015 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15016 return true;
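/* A minimal standalone sketch, for illustration only, of the offset
   rebasing above: for the stores at xb+0x100..0x10c in the comment,
   SImode gives stp_off_limit = 4 * 0x40 = 0x100, so the block is
   rebased with ADD scratch, xb, 0x100 and the pairs use offsets 0
   and 8.  */
#include <stdbool.h>
#include <stdio.h>

static bool
rebase_for_ldpstp (long off_val, long msize, long *adj_off, long *new_off)
{
  long stp_off_limit = msize * 0x40;
  long abs_off = off_val < 0 ? -off_val : off_val;

  *new_off = abs_off % stp_off_limit;
  *adj_off = abs_off - *new_off;
  if (*new_off + msize * 2 >= stp_off_limit)
    {
      *adj_off += stp_off_limit;
      *new_off -= stp_off_limit;
    }
  /* The adjustment must fit an ADD/SUB immediate.  */
  if (*adj_off >= 0x1000)
    return false;
  if (off_val < 0)
    {
      *adj_off = -*adj_off;
      *new_off = -*new_off;
    }
  return true;
}

int
main (void)
{
  long adj, off;
  if (rebase_for_ldpstp (0x100, 4, &adj, &off))
    printf ("add #%#lx, first pair at offset %ld\n", adj, off);
  return 0;
}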
15019 /* Return true if a pseudo register should be created and used to hold
15020 the GOT address for PIC code. */
15022 bool
15023 aarch64_use_pseudo_pic_reg (void)
15025 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15028 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15030 static int
15031 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15033 switch (XINT (x, 1))
15035 case UNSPEC_GOTSMALLPIC:
15036 case UNSPEC_GOTSMALLPIC28K:
15037 case UNSPEC_GOTTINYPIC:
15038 return 0;
15039 default:
15040 break;
15043 return default_unspec_may_trap_p (x, flags);
15047 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15048 return the log2 of that value. Otherwise return -1. */
15051 aarch64_fpconst_pow_of_2 (rtx x)
15053 const REAL_VALUE_TYPE *r;
15055 if (!CONST_DOUBLE_P (x))
15056 return -1;
15058 r = CONST_DOUBLE_REAL_VALUE (x);
15060 if (REAL_VALUE_NEGATIVE (*r)
15061 || REAL_VALUE_ISNAN (*r)
15062 || REAL_VALUE_ISINF (*r)
15063 || !real_isinteger (r, DFmode))
15064 return -1;
15066 return exact_log2 (real_to_integer (r));
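/* A minimal standalone sketch, for illustration only, of the property
   tested above: a finite, positive double is an integral power of two
   exactly when its frexp mantissa is 0.5 and the binary exponent is at
   least 1; the log2 is then that exponent minus one.  */
#include <math.h>
#include <stdio.h>

static int
fp_log2_if_integral_pow2 (double x)
{
  int exp;
  if (!(x > 0) || isinf (x) || frexp (x, &exp) != 0.5 || exp < 1)
    return -1;
  return exp - 1;
}

int
main (void)
{
  printf ("%d %d %d\n", fp_log2_if_integral_pow2 (8.0),
	  fp_log2_if_integral_pow2 (0.5), fp_log2_if_integral_pow2 (3.0));
  return 0;
}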
15069 /* If X is a vector of equal CONST_DOUBLE values and that value is
15070 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15073 aarch64_vec_fpconst_pow_of_2 (rtx x)
15075 if (GET_CODE (x) != CONST_VECTOR)
15076 return -1;
15078 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15079 return -1;
15081 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15082 if (firstval <= 0)
15083 return -1;
15085 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15086 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15087 return -1;
15089 return firstval;
15092 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15093 to float.
15095 __fp16 always promotes through this hook.
15096 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15097 through the generic excess precision logic rather than here. */
15099 static tree
15100 aarch64_promoted_type (const_tree t)
15102 if (SCALAR_FLOAT_TYPE_P (t)
15103 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15104 return float_type_node;
15106 return NULL_TREE;
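/* A minimal sketch, for illustration only, of what the hook above means
   in source terms: arithmetic on __fp16 operands is carried out in
   float, so the addition below is a float add followed by a narrowing
   conversion.  Assumes a compiler that provides the __fp16 type.  */
__fp16
add_fp16 (__fp16 a, __fp16 b)
{
  return a + b;	/* Evaluated as (float) a + (float) b, then narrowed.  */
}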
15109 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15111 static bool
15112 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15113 optimization_type opt_type)
15115 switch (op)
15117 case rsqrt_optab:
15118 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15120 default:
15121 return true;
15125 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15126 if MODE is HFmode, and punt to the generic implementation otherwise. */
15128 static bool
15129 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
15131 return (mode == HFmode
15132 ? true
15133 : default_libgcc_floating_mode_supported_p (mode));
15136 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15137 if MODE is HFmode, and punt to the generic implementation otherwise. */
15139 static bool
15140 aarch64_scalar_mode_supported_p (machine_mode mode)
15142 return (mode == HFmode
15143 ? true
15144 : default_scalar_mode_supported_p (mode));
15147 /* Set the value of FLT_EVAL_METHOD.
15148 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15150 0: evaluate all operations and constants, whose semantic type has at
15151 most the range and precision of type float, to the range and
15152 precision of float; evaluate all other operations and constants to
15153 the range and precision of the semantic type;
15155 N, where _FloatN is a supported interchange floating type:
15156 evaluate all operations and constants, whose semantic type has at
15157 most the range and precision of _FloatN type, to the range and
15158 precision of the _FloatN type; evaluate all other operations and
15159 constants to the range and precision of the semantic type;
15161 If we have the ARMv8.2-A extensions then we support _Float16 in native
15162 precision, so we should set this to 16. Otherwise, we support the type,
15163 but want to evaluate expressions in float precision, so set this to
15164 0. */
15166 static enum flt_eval_method
15167 aarch64_excess_precision (enum excess_precision_type type)
15169 switch (type)
15171 case EXCESS_PRECISION_TYPE_FAST:
15172 case EXCESS_PRECISION_TYPE_STANDARD:
15173 /* We can calculate either in 16-bit range and precision or
15174 32-bit range and precision. Make that decision based on whether
15175 we have native support for the ARMv8.2-A 16-bit floating-point
15176 instructions or not. */
15177 return (TARGET_FP_F16INST
15178 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15179 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15180 case EXCESS_PRECISION_TYPE_IMPLICIT:
15181 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15182 default:
15183 gcc_unreachable ();
15185 return FLT_EVAL_METHOD_UNPREDICTABLE;
15188 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15189 scheduled for speculative execution. Reject the long-running division
15190 and square-root instructions. */
15192 static bool
15193 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15195 switch (get_attr_type (insn))
15197 case TYPE_SDIV:
15198 case TYPE_UDIV:
15199 case TYPE_FDIVS:
15200 case TYPE_FDIVD:
15201 case TYPE_FSQRTS:
15202 case TYPE_FSQRTD:
15203 case TYPE_NEON_FP_SQRT_S:
15204 case TYPE_NEON_FP_SQRT_D:
15205 case TYPE_NEON_FP_SQRT_S_Q:
15206 case TYPE_NEON_FP_SQRT_D_Q:
15207 case TYPE_NEON_FP_DIV_S:
15208 case TYPE_NEON_FP_DIV_D:
15209 case TYPE_NEON_FP_DIV_S_Q:
15210 case TYPE_NEON_FP_DIV_D_Q:
15211 return false;
15212 default:
15213 return true;
15217 /* Target-specific selftests. */
15219 #if CHECKING_P
15221 namespace selftest {
15223 /* Selftest for the RTL loader.
15224 Verify that the RTL loader copes with a dump from
15225 print_rtx_function. This is essentially just a test that class
15226 function_reader can handle a real dump, but it also verifies
15227 that lookup_reg_by_dump_name correctly handles hard regs.
15228 The presence of hard reg names in the dump means that the test is
15229 target-specific, hence it is in this file. */
15231 static void
15232 aarch64_test_loading_full_dump ()
15234 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15236 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15238 rtx_insn *insn_1 = get_insn_by_uid (1);
15239 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15241 rtx_insn *insn_15 = get_insn_by_uid (15);
15242 ASSERT_EQ (INSN, GET_CODE (insn_15));
15243 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15245 /* Verify crtl->return_rtx. */
15246 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15247 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15248 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15251 /* Run all target-specific selftests. */
15253 static void
15254 aarch64_run_selftests (void)
15256 aarch64_test_loading_full_dump ();
15259 } // namespace selftest
15261 #endif /* #if CHECKING_P */
15263 #undef TARGET_ADDRESS_COST
15264 #define TARGET_ADDRESS_COST aarch64_address_cost
15266 /* This hook determines whether unnamed bitfields affect the alignment
15267 of the containing structure. The hook returns true if the structure
15268 should inherit the alignment requirements of an unnamed bitfield's
15269 type. */
15270 #undef TARGET_ALIGN_ANON_BITFIELD
15271 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15273 #undef TARGET_ASM_ALIGNED_DI_OP
15274 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15276 #undef TARGET_ASM_ALIGNED_HI_OP
15277 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15279 #undef TARGET_ASM_ALIGNED_SI_OP
15280 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15282 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15283 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15284 hook_bool_const_tree_hwi_hwi_const_tree_true
15286 #undef TARGET_ASM_FILE_START
15287 #define TARGET_ASM_FILE_START aarch64_start_file
15289 #undef TARGET_ASM_OUTPUT_MI_THUNK
15290 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15292 #undef TARGET_ASM_SELECT_RTX_SECTION
15293 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15295 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15296 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15298 #undef TARGET_BUILD_BUILTIN_VA_LIST
15299 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15301 #undef TARGET_CALLEE_COPIES
15302 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15304 #undef TARGET_CAN_ELIMINATE
15305 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15307 #undef TARGET_CAN_INLINE_P
15308 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15310 #undef TARGET_CANNOT_FORCE_CONST_MEM
15311 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15313 #undef TARGET_CASE_VALUES_THRESHOLD
15314 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15316 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15317 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15319 /* Only the least significant bit is used for initialization guard
15320 variables. */
15321 #undef TARGET_CXX_GUARD_MASK_BIT
15322 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15324 #undef TARGET_C_MODE_FOR_SUFFIX
15325 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15327 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15328 #undef TARGET_DEFAULT_TARGET_FLAGS
15329 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15330 #endif
15332 #undef TARGET_CLASS_MAX_NREGS
15333 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15335 #undef TARGET_BUILTIN_DECL
15336 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15338 #undef TARGET_BUILTIN_RECIPROCAL
15339 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15341 #undef TARGET_C_EXCESS_PRECISION
15342 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15344 #undef TARGET_EXPAND_BUILTIN
15345 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15347 #undef TARGET_EXPAND_BUILTIN_VA_START
15348 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15350 #undef TARGET_FOLD_BUILTIN
15351 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15353 #undef TARGET_FUNCTION_ARG
15354 #define TARGET_FUNCTION_ARG aarch64_function_arg
15356 #undef TARGET_FUNCTION_ARG_ADVANCE
15357 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15359 #undef TARGET_FUNCTION_ARG_BOUNDARY
15360 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15362 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15363 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15365 #undef TARGET_FUNCTION_VALUE
15366 #define TARGET_FUNCTION_VALUE aarch64_function_value
15368 #undef TARGET_FUNCTION_VALUE_REGNO_P
15369 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15371 #undef TARGET_FRAME_POINTER_REQUIRED
15372 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15374 #undef TARGET_GIMPLE_FOLD_BUILTIN
15375 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15377 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15378 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15380 #undef TARGET_INIT_BUILTINS
15381 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15383 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15384 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15385 aarch64_ira_change_pseudo_allocno_class
15387 #undef TARGET_LEGITIMATE_ADDRESS_P
15388 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15390 #undef TARGET_LEGITIMATE_CONSTANT_P
15391 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15393 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15394 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15395 aarch64_legitimize_address_displacement
15397 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15398 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15400 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15401 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15402 aarch64_libgcc_floating_mode_supported_p
15404 #undef TARGET_MANGLE_TYPE
15405 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15407 #undef TARGET_MEMORY_MOVE_COST
15408 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15410 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15411 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15413 #undef TARGET_MUST_PASS_IN_STACK
15414 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15416 /* This target hook should return true if accesses to volatile bitfields
15417 should use the narrowest mode possible. It should return false if these
15418 accesses should use the bitfield container type. */
15419 #undef TARGET_NARROW_VOLATILE_BITFIELD
15420 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15422 #undef TARGET_OPTION_OVERRIDE
15423 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15425 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15426 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15427 aarch64_override_options_after_change
15429 #undef TARGET_OPTION_SAVE
15430 #define TARGET_OPTION_SAVE aarch64_option_save
15432 #undef TARGET_OPTION_RESTORE
15433 #define TARGET_OPTION_RESTORE aarch64_option_restore
15435 #undef TARGET_OPTION_PRINT
15436 #define TARGET_OPTION_PRINT aarch64_option_print
15438 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15439 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15441 #undef TARGET_SET_CURRENT_FUNCTION
15442 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15444 #undef TARGET_PASS_BY_REFERENCE
15445 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15447 #undef TARGET_PREFERRED_RELOAD_CLASS
15448 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15450 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15451 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15453 #undef TARGET_PROMOTED_TYPE
15454 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15456 #undef TARGET_SECONDARY_RELOAD
15457 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15459 #undef TARGET_SHIFT_TRUNCATION_MASK
15460 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15462 #undef TARGET_SETUP_INCOMING_VARARGS
15463 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15465 #undef TARGET_STRUCT_VALUE_RTX
15466 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15468 #undef TARGET_REGISTER_MOVE_COST
15469 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15471 #undef TARGET_RETURN_IN_MEMORY
15472 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15474 #undef TARGET_RETURN_IN_MSB
15475 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15477 #undef TARGET_RTX_COSTS
15478 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15480 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15481 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15483 #undef TARGET_SCHED_ISSUE_RATE
15484 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15486 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15487 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15488 aarch64_sched_first_cycle_multipass_dfa_lookahead
15490 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15491 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15492 aarch64_first_cycle_multipass_dfa_lookahead_guard
15494 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15495 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15496 aarch64_get_separate_components
15498 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15499 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15500 aarch64_components_for_bb
15502 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15503 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15504 aarch64_disqualify_components
15506 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15507 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15508 aarch64_emit_prologue_components
15510 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15511 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15512 aarch64_emit_epilogue_components
15514 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15515 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15516 aarch64_set_handled_components
15518 #undef TARGET_TRAMPOLINE_INIT
15519 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15521 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15522 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15524 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15525 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15527 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15528 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15529 aarch64_builtin_support_vector_misalignment
15531 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15532 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15534 #undef TARGET_VECTORIZE_ADD_STMT_COST
15535 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15537 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15538 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15539 aarch64_builtin_vectorization_cost
15541 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15542 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15544 #undef TARGET_VECTORIZE_BUILTINS
15545 #define TARGET_VECTORIZE_BUILTINS
15547 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15548 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15549 aarch64_builtin_vectorized_function
15551 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15552 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15553 aarch64_autovectorize_vector_sizes
15555 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15556 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15557 aarch64_atomic_assign_expand_fenv
15559 /* Section anchor support. */
15561 #undef TARGET_MIN_ANCHOR_OFFSET
15562 #define TARGET_MIN_ANCHOR_OFFSET -256
15564 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15565 byte offset; we can do much more for larger data types, but have no way
15566 to determine the size of the access. We assume accesses are aligned. */
15567 #undef TARGET_MAX_ANCHOR_OFFSET
15568 #define TARGET_MAX_ANCHOR_OFFSET 4095
15570 #undef TARGET_VECTOR_ALIGNMENT
15571 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15573 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15574 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15575 aarch64_simd_vector_alignment_reachable
15577 /* vec_perm support. */
15579 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15580 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15581 aarch64_vectorize_vec_perm_const_ok
15583 #undef TARGET_INIT_LIBFUNCS
15584 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15586 #undef TARGET_FIXED_CONDITION_CODE_REGS
15587 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15589 #undef TARGET_FLAGS_REGNUM
15590 #define TARGET_FLAGS_REGNUM CC_REGNUM
15592 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15593 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15595 #undef TARGET_ASAN_SHADOW_OFFSET
15596 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15598 #undef TARGET_LEGITIMIZE_ADDRESS
15599 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15601 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15602 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15603 aarch64_use_by_pieces_infrastructure_p
15605 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15606 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15608 #undef TARGET_CAN_USE_DOLOOP_P
15609 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15611 #undef TARGET_SCHED_ADJUST_PRIORITY
15612 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15614 #undef TARGET_SCHED_MACRO_FUSION_P
15615 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15617 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15618 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15620 #undef TARGET_SCHED_FUSION_PRIORITY
15621 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15623 #undef TARGET_UNSPEC_MAY_TRAP_P
15624 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15626 #undef TARGET_USE_PSEUDO_PIC_REG
15627 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15629 #undef TARGET_PRINT_OPERAND
15630 #define TARGET_PRINT_OPERAND aarch64_print_operand
15632 #undef TARGET_PRINT_OPERAND_ADDRESS
15633 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15635 #undef TARGET_OPTAB_SUPPORTED_P
15636 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15638 #undef TARGET_OMIT_STRUCT_RETURN_REG
15639 #define TARGET_OMIT_STRUCT_RETURN_REG true
15641 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15642 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15643 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15645 #if CHECKING_P
15646 #undef TARGET_RUN_TARGET_SELFTESTS
15647 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15648 #endif /* #if CHECKING_P */
15650 struct gcc_target targetm = TARGET_INITIALIZER;
15652 #include "gt-aarch64.h"