[AArch64] Fix ICEs in aarch64_print_operand
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 83d86071312a25789dc40f4f236fec5260d04c87
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 ADDRESS_SYMBOLIC:
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
145 vec_perm_indices);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
153 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
155 /* Major revision number of the ARM Architecture implemented by the target. */
156 unsigned aarch64_architecture_version;
158 /* The processor for which instructions should be scheduled. */
159 enum aarch64_processor aarch64_tune = cortexa53;
161 /* Mask to specify which instruction scheduling options should be used. */
162 unsigned long aarch64_tune_flags = 0;
164 /* Global flag for PC relative loads. */
165 bool aarch64_pcrelative_literal_loads;
167 /* Support for command line parsing of boolean flags in the tuning
168 structures. */
169 struct aarch64_flag_desc
171 const char* name;
172 unsigned int flag;
175 #define AARCH64_FUSION_PAIR(name, internal_name) \
176 { name, AARCH64_FUSE_##internal_name },
177 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
179 { "none", AARCH64_FUSE_NOTHING },
180 #include "aarch64-fusion-pairs.def"
181 { "all", AARCH64_FUSE_ALL },
182 { NULL, AARCH64_FUSE_NOTHING }
185 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
186 { name, AARCH64_EXTRA_TUNE_##internal_name },
187 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
189 { "none", AARCH64_EXTRA_TUNE_NONE },
190 #include "aarch64-tuning-flags.def"
191 { "all", AARCH64_EXTRA_TUNE_ALL },
192 { NULL, AARCH64_EXTRA_TUNE_NONE }
195 /* Tuning parameters. */
197 static const struct cpu_addrcost_table generic_addrcost_table =
200 1, /* hi */
201 0, /* si */
202 0, /* di */
203 1, /* ti */
205 0, /* pre_modify */
206 0, /* post_modify */
207 0, /* register_offset */
208 0, /* register_sextend */
209 0, /* register_zextend */
210 0 /* imm_offset */
213 static const struct cpu_addrcost_table exynosm1_addrcost_table =
216 0, /* hi */
217 0, /* si */
218 0, /* di */
219 2, /* ti */
221 0, /* pre_modify */
222 0, /* post_modify */
223 1, /* register_offset */
224 1, /* register_sextend */
225 2, /* register_zextend */
226 0, /* imm_offset */
229 static const struct cpu_addrcost_table xgene1_addrcost_table =
232 1, /* hi */
233 0, /* si */
234 0, /* di */
235 1, /* ti */
237 1, /* pre_modify */
238 0, /* post_modify */
239 0, /* register_offset */
240 1, /* register_sextend */
241 1, /* register_zextend */
242 0, /* imm_offset */
245 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
248 1, /* hi */
249 1, /* si */
250 1, /* di */
251 2, /* ti */
253 0, /* pre_modify */
254 0, /* post_modify */
255 2, /* register_offset */
256 3, /* register_sextend */
257 3, /* register_zextend */
258 0, /* imm_offset */
261 static const struct cpu_regmove_cost generic_regmove_cost =
263 1, /* GP2GP */
264 /* Avoid the use of slow int<->fp moves for spilling by setting
265 their cost higher than memmov_cost. */
266 5, /* GP2FP */
267 5, /* FP2GP */
268 2 /* FP2FP */
271 static const struct cpu_regmove_cost cortexa57_regmove_cost =
273 1, /* GP2GP */
274 /* Avoid the use of slow int<->fp moves for spilling by setting
275 their cost higher than memmov_cost. */
276 5, /* GP2FP */
277 5, /* FP2GP */
278 2 /* FP2FP */
281 static const struct cpu_regmove_cost cortexa53_regmove_cost =
283 1, /* GP2GP */
284 /* Avoid the use of slow int<->fp moves for spilling by setting
285 their cost higher than memmov_cost. */
286 5, /* GP2FP */
287 5, /* FP2GP */
288 2 /* FP2FP */
291 static const struct cpu_regmove_cost exynosm1_regmove_cost =
293 1, /* GP2GP */
294 /* Avoid the use of slow int<->fp moves for spilling by setting
295 their cost higher than memmov_cost (actually 4 and 9). */
296 9, /* GP2FP */
297 9, /* FP2GP */
298 1 /* FP2FP */
301 static const struct cpu_regmove_cost thunderx_regmove_cost =
303 2, /* GP2GP */
304 2, /* GP2FP */
305 6, /* FP2GP */
306 4 /* FP2FP */
309 static const struct cpu_regmove_cost xgene1_regmove_cost =
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 8, /* GP2FP */
315 8, /* FP2GP */
316 2 /* FP2FP */
319 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
321 2, /* GP2GP */
322 /* Avoid the use of int<->fp moves for spilling. */
323 6, /* GP2FP */
324 6, /* FP2GP */
325 4 /* FP2FP */
328 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
330 1, /* GP2GP */
331 /* Avoid the use of int<->fp moves for spilling. */
332 8, /* GP2FP */
333 8, /* FP2GP */
334 4 /* FP2FP */
337 /* Generic costs for vector insn classes. */
338 static const struct cpu_vector_cost generic_vector_cost =
340 1, /* scalar_int_stmt_cost */
341 1, /* scalar_fp_stmt_cost */
342 1, /* scalar_load_cost */
343 1, /* scalar_store_cost */
344 1, /* vec_int_stmt_cost */
345 1, /* vec_fp_stmt_cost */
346 2, /* vec_permute_cost */
347 1, /* vec_to_scalar_cost */
348 1, /* scalar_to_vec_cost */
349 1, /* vec_align_load_cost */
350 1, /* vec_unalign_load_cost */
351 1, /* vec_unalign_store_cost */
352 1, /* vec_store_cost */
353 3, /* cond_taken_branch_cost */
354 1 /* cond_not_taken_branch_cost */
357 /* ThunderX costs for vector insn classes. */
358 static const struct cpu_vector_cost thunderx_vector_cost =
360 1, /* scalar_int_stmt_cost */
361 1, /* scalar_fp_stmt_cost */
362 3, /* scalar_load_cost */
363 1, /* scalar_store_cost */
364 4, /* vec_int_stmt_cost */
365 1, /* vec_fp_stmt_cost */
366 4, /* vec_permute_cost */
367 2, /* vec_to_scalar_cost */
368 2, /* scalar_to_vec_cost */
369 3, /* vec_align_load_cost */
370 5, /* vec_unalign_load_cost */
371 5, /* vec_unalign_store_cost */
372 1, /* vec_store_cost */
373 3, /* cond_taken_branch_cost */
374 3 /* cond_not_taken_branch_cost */
377 /* Cortex-A57 costs for vector insn classes. */
378 static const struct cpu_vector_cost cortexa57_vector_cost =
380 1, /* scalar_int_stmt_cost */
381 1, /* scalar_fp_stmt_cost */
382 4, /* scalar_load_cost */
383 1, /* scalar_store_cost */
384 2, /* vec_int_stmt_cost */
385 2, /* vec_fp_stmt_cost */
386 3, /* vec_permute_cost */
387 8, /* vec_to_scalar_cost */
388 8, /* scalar_to_vec_cost */
389 4, /* vec_align_load_cost */
390 4, /* vec_unalign_load_cost */
391 1, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 1, /* cond_taken_branch_cost */
394 1 /* cond_not_taken_branch_cost */
397 static const struct cpu_vector_cost exynosm1_vector_cost =
399 1, /* scalar_int_stmt_cost */
400 1, /* scalar_fp_stmt_cost */
401 5, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 3, /* vec_int_stmt_cost */
404 3, /* vec_fp_stmt_cost */
405 3, /* vec_permute_cost */
406 3, /* vec_to_scalar_cost */
407 3, /* scalar_to_vec_cost */
408 5, /* vec_align_load_cost */
409 5, /* vec_unalign_load_cost */
410 1, /* vec_unalign_store_cost */
411 1, /* vec_store_cost */
412 1, /* cond_taken_branch_cost */
413 1 /* cond_not_taken_branch_cost */
416 /* X-Gene 1 costs for vector insn classes. */
417 static const struct cpu_vector_cost xgene1_vector_cost =
419 1, /* scalar_int_stmt_cost */
420 1, /* scalar_fp_stmt_cost */
421 5, /* scalar_load_cost */
422 1, /* scalar_store_cost */
423 2, /* vec_int_stmt_cost */
424 2, /* vec_fp_stmt_cost */
425 2, /* vec_permute_cost */
426 4, /* vec_to_scalar_cost */
427 4, /* scalar_to_vec_cost */
428 10, /* vec_align_load_cost */
429 10, /* vec_unalign_load_cost */
430 2, /* vec_unalign_store_cost */
431 2, /* vec_store_cost */
432 2, /* cond_taken_branch_cost */
433 1 /* cond_not_taken_branch_cost */
436 /* Costs for vector insn classes for Vulcan. */
437 static const struct cpu_vector_cost thunderx2t99_vector_cost =
439 1, /* scalar_int_stmt_cost */
440 6, /* scalar_fp_stmt_cost */
441 4, /* scalar_load_cost */
442 1, /* scalar_store_cost */
443 5, /* vec_int_stmt_cost */
444 6, /* vec_fp_stmt_cost */
445 3, /* vec_permute_cost */
446 6, /* vec_to_scalar_cost */
447 5, /* scalar_to_vec_cost */
448 8, /* vec_align_load_cost */
449 8, /* vec_unalign_load_cost */
450 4, /* vec_unalign_store_cost */
451 4, /* vec_store_cost */
452 2, /* cond_taken_branch_cost */
453 1 /* cond_not_taken_branch_cost */
456 /* Generic costs for branch instructions. */
457 static const struct cpu_branch_cost generic_branch_cost =
459 1, /* Predictable. */
460 3 /* Unpredictable. */
463 /* Generic approximation modes. */
464 static const cpu_approx_modes generic_approx_modes =
466 AARCH64_APPROX_NONE, /* division */
467 AARCH64_APPROX_NONE, /* sqrt */
468 AARCH64_APPROX_NONE /* recip_sqrt */
471 /* Approximation modes for Exynos M1. */
472 static const cpu_approx_modes exynosm1_approx_modes =
474 AARCH64_APPROX_NONE, /* division */
475 AARCH64_APPROX_ALL, /* sqrt */
476 AARCH64_APPROX_ALL /* recip_sqrt */
479 /* Approximation modes for X-Gene 1. */
480 static const cpu_approx_modes xgene1_approx_modes =
482 AARCH64_APPROX_NONE, /* division */
483 AARCH64_APPROX_NONE, /* sqrt */
484 AARCH64_APPROX_ALL /* recip_sqrt */
487 /* Generic prefetch settings (which disable prefetch). */
488 static const cpu_prefetch_tune generic_prefetch_tune =
490 0, /* num_slots */
491 -1, /* l1_cache_size */
492 -1, /* l1_cache_line_size */
493 -1, /* l2_cache_size */
494 -1 /* default_opt_level */
497 static const cpu_prefetch_tune exynosm1_prefetch_tune =
499 0, /* num_slots */
500 -1, /* l1_cache_size */
501 64, /* l1_cache_line_size */
502 -1, /* l2_cache_size */
503 -1 /* default_opt_level */
506 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
508 4, /* num_slots */
509 32, /* l1_cache_size */
510 64, /* l1_cache_line_size */
511 1024, /* l2_cache_size */
512 -1 /* default_opt_level */
515 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
517 8, /* num_slots */
518 32, /* l1_cache_size */
519 128, /* l1_cache_line_size */
520 16*1024, /* l2_cache_size */
521 3 /* default_opt_level */
524 static const cpu_prefetch_tune thunderx_prefetch_tune =
526 8, /* num_slots */
527 32, /* l1_cache_size */
528 128, /* l1_cache_line_size */
529 -1, /* l2_cache_size */
530 -1 /* default_opt_level */
533 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
535 8, /* num_slots */
536 32, /* l1_cache_size */
537 64, /* l1_cache_line_size */
538 256, /* l2_cache_size */
539 -1 /* default_opt_level */
542 static const struct tune_params generic_tunings =
544 &cortexa57_extra_costs,
545 &generic_addrcost_table,
546 &generic_regmove_cost,
547 &generic_vector_cost,
548 &generic_branch_cost,
549 &generic_approx_modes,
550 4, /* memmov_cost */
551 2, /* issue_rate */
552 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
553 8, /* function_align. */
554 4, /* jump_align. */
555 8, /* loop_align. */
556 2, /* int_reassoc_width. */
557 4, /* fp_reassoc_width. */
558 1, /* vec_reassoc_width. */
559 2, /* min_div_recip_mul_sf. */
560 2, /* min_div_recip_mul_df. */
561 0, /* max_case_values. */
562 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
563 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
564 &generic_prefetch_tune
567 static const struct tune_params cortexa35_tunings =
569 &cortexa53_extra_costs,
570 &generic_addrcost_table,
571 &cortexa53_regmove_cost,
572 &generic_vector_cost,
573 &generic_branch_cost,
574 &generic_approx_modes,
575 4, /* memmov_cost */
576 1, /* issue_rate */
577 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
578 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
579 16, /* function_align. */
580 4, /* jump_align. */
581 8, /* loop_align. */
582 2, /* int_reassoc_width. */
583 4, /* fp_reassoc_width. */
584 1, /* vec_reassoc_width. */
585 2, /* min_div_recip_mul_sf. */
586 2, /* min_div_recip_mul_df. */
587 0, /* max_case_values. */
588 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
589 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
590 &generic_prefetch_tune
593 static const struct tune_params cortexa53_tunings =
595 &cortexa53_extra_costs,
596 &generic_addrcost_table,
597 &cortexa53_regmove_cost,
598 &generic_vector_cost,
599 &generic_branch_cost,
600 &generic_approx_modes,
601 4, /* memmov_cost */
602 2, /* issue_rate */
603 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
604 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
605 16, /* function_align. */
606 4, /* jump_align. */
607 8, /* loop_align. */
608 2, /* int_reassoc_width. */
609 4, /* fp_reassoc_width. */
610 1, /* vec_reassoc_width. */
611 2, /* min_div_recip_mul_sf. */
612 2, /* min_div_recip_mul_df. */
613 0, /* max_case_values. */
614 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
615 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
616 &generic_prefetch_tune
619 static const struct tune_params cortexa57_tunings =
621 &cortexa57_extra_costs,
622 &generic_addrcost_table,
623 &cortexa57_regmove_cost,
624 &cortexa57_vector_cost,
625 &generic_branch_cost,
626 &generic_approx_modes,
627 4, /* memmov_cost */
628 3, /* issue_rate */
629 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
630 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
631 16, /* function_align. */
632 4, /* jump_align. */
633 8, /* loop_align. */
634 2, /* int_reassoc_width. */
635 4, /* fp_reassoc_width. */
636 1, /* vec_reassoc_width. */
637 2, /* min_div_recip_mul_sf. */
638 2, /* min_div_recip_mul_df. */
639 0, /* max_case_values. */
640 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
641 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
642 &generic_prefetch_tune
645 static const struct tune_params cortexa72_tunings =
647 &cortexa57_extra_costs,
648 &generic_addrcost_table,
649 &cortexa57_regmove_cost,
650 &cortexa57_vector_cost,
651 &generic_branch_cost,
652 &generic_approx_modes,
653 4, /* memmov_cost */
654 3, /* issue_rate */
655 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
656 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
657 16, /* function_align. */
658 4, /* jump_align. */
659 8, /* loop_align. */
660 2, /* int_reassoc_width. */
661 4, /* fp_reassoc_width. */
662 1, /* vec_reassoc_width. */
663 2, /* min_div_recip_mul_sf. */
664 2, /* min_div_recip_mul_df. */
665 0, /* max_case_values. */
666 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
667 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
668 &generic_prefetch_tune
671 static const struct tune_params cortexa73_tunings =
673 &cortexa57_extra_costs,
674 &generic_addrcost_table,
675 &cortexa57_regmove_cost,
676 &cortexa57_vector_cost,
677 &generic_branch_cost,
678 &generic_approx_modes,
679 4, /* memmov_cost. */
680 2, /* issue_rate. */
681 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
682 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
683 16, /* function_align. */
684 4, /* jump_align. */
685 8, /* loop_align. */
686 2, /* int_reassoc_width. */
687 4, /* fp_reassoc_width. */
688 1, /* vec_reassoc_width. */
689 2, /* min_div_recip_mul_sf. */
690 2, /* min_div_recip_mul_df. */
691 0, /* max_case_values. */
692 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
693 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
694 &generic_prefetch_tune
699 static const struct tune_params exynosm1_tunings =
701 &exynosm1_extra_costs,
702 &exynosm1_addrcost_table,
703 &exynosm1_regmove_cost,
704 &exynosm1_vector_cost,
705 &generic_branch_cost,
706 &exynosm1_approx_modes,
707 4, /* memmov_cost */
708 3, /* issue_rate */
709 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
710 4, /* function_align. */
711 4, /* jump_align. */
712 4, /* loop_align. */
713 2, /* int_reassoc_width. */
714 4, /* fp_reassoc_width. */
715 1, /* vec_reassoc_width. */
716 2, /* min_div_recip_mul_sf. */
717 2, /* min_div_recip_mul_df. */
718 48, /* max_case_values. */
719 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
720 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
721 &exynosm1_prefetch_tune
724 static const struct tune_params thunderxt88_tunings =
726 &thunderx_extra_costs,
727 &generic_addrcost_table,
728 &thunderx_regmove_cost,
729 &thunderx_vector_cost,
730 &generic_branch_cost,
731 &generic_approx_modes,
732 6, /* memmov_cost */
733 2, /* issue_rate */
734 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
735 8, /* function_align. */
736 8, /* jump_align. */
737 8, /* loop_align. */
738 2, /* int_reassoc_width. */
739 4, /* fp_reassoc_width. */
740 1, /* vec_reassoc_width. */
741 2, /* min_div_recip_mul_sf. */
742 2, /* min_div_recip_mul_df. */
743 0, /* max_case_values. */
744 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
745 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
746 &thunderxt88_prefetch_tune
749 static const struct tune_params thunderx_tunings =
751 &thunderx_extra_costs,
752 &generic_addrcost_table,
753 &thunderx_regmove_cost,
754 &thunderx_vector_cost,
755 &generic_branch_cost,
756 &generic_approx_modes,
757 6, /* memmov_cost */
758 2, /* issue_rate */
759 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
760 8, /* function_align. */
761 8, /* jump_align. */
762 8, /* loop_align. */
763 2, /* int_reassoc_width. */
764 4, /* fp_reassoc_width. */
765 1, /* vec_reassoc_width. */
766 2, /* min_div_recip_mul_sf. */
767 2, /* min_div_recip_mul_df. */
768 0, /* max_case_values. */
769 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
770 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
771 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
772 &thunderx_prefetch_tune
775 static const struct tune_params xgene1_tunings =
777 &xgene1_extra_costs,
778 &xgene1_addrcost_table,
779 &xgene1_regmove_cost,
780 &xgene1_vector_cost,
781 &generic_branch_cost,
782 &xgene1_approx_modes,
783 6, /* memmov_cost */
784 4, /* issue_rate */
785 AARCH64_FUSE_NOTHING, /* fusible_ops */
786 16, /* function_align. */
787 8, /* jump_align. */
788 16, /* loop_align. */
789 2, /* int_reassoc_width. */
790 4, /* fp_reassoc_width. */
791 1, /* vec_reassoc_width. */
792 2, /* min_div_recip_mul_sf. */
793 2, /* min_div_recip_mul_df. */
794 0, /* max_case_values. */
795 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
796 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
797 &generic_prefetch_tune
800 static const struct tune_params qdf24xx_tunings =
802 &qdf24xx_extra_costs,
803 &generic_addrcost_table,
804 &qdf24xx_regmove_cost,
805 &generic_vector_cost,
806 &generic_branch_cost,
807 &generic_approx_modes,
808 4, /* memmov_cost */
809 4, /* issue_rate */
810 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 16, /* function_align. */
813 8, /* jump_align. */
814 16, /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
823 &qdf24xx_prefetch_tune
826 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
827 for now. */
828 static const struct tune_params saphira_tunings =
830 &generic_extra_costs,
831 &generic_addrcost_table,
832 &generic_regmove_cost,
833 &generic_vector_cost,
834 &generic_branch_cost,
835 &generic_approx_modes,
836 4, /* memmov_cost */
837 4, /* issue_rate */
838 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
839 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
840 16, /* function_align. */
841 8, /* jump_align. */
842 16, /* loop_align. */
843 2, /* int_reassoc_width. */
844 4, /* fp_reassoc_width. */
845 1, /* vec_reassoc_width. */
846 2, /* min_div_recip_mul_sf. */
847 2, /* min_div_recip_mul_df. */
848 0, /* max_case_values. */
849 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
850 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
851 &generic_prefetch_tune
854 static const struct tune_params thunderx2t99_tunings =
856 &thunderx2t99_extra_costs,
857 &thunderx2t99_addrcost_table,
858 &thunderx2t99_regmove_cost,
859 &thunderx2t99_vector_cost,
860 &generic_branch_cost,
861 &generic_approx_modes,
862 4, /* memmov_cost. */
863 4, /* issue_rate. */
864 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
865 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
866 16, /* function_align. */
867 8, /* jump_align. */
868 16, /* loop_align. */
869 3, /* int_reassoc_width. */
870 2, /* fp_reassoc_width. */
871 2, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &thunderx2t99_prefetch_tune
880 /* Support for fine-grained override of the tuning structures. */
881 struct aarch64_tuning_override_function
883 const char* name;
884 void (*parse_override)(const char*, struct tune_params*);
887 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
888 static void aarch64_parse_tune_string (const char*, struct tune_params*);
890 static const struct aarch64_tuning_override_function
891 aarch64_tuning_override_functions[] =
893 { "fuse", aarch64_parse_fuse_string },
894 { "tune", aarch64_parse_tune_string },
895 { NULL, NULL }
898 /* A processor implementing AArch64. */
899 struct processor
901 const char *const name;
902 enum aarch64_processor ident;
903 enum aarch64_processor sched_core;
904 enum aarch64_arch arch;
905 unsigned architecture_version;
906 const unsigned long flags;
907 const struct tune_params *const tune;
910 /* Architectures implementing AArch64. */
911 static const struct processor all_architectures[] =
913 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
914 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
915 #include "aarch64-arches.def"
916 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
919 /* Processor cores implementing AArch64. */
920 static const struct processor all_cores[] =
922 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
923 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
924 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
925 FLAGS, &COSTS##_tunings},
926 #include "aarch64-cores.def"
927 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
928 AARCH64_FL_FOR_ARCH8, &generic_tunings},
929 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
933 /* Target specification. These are populated by the -march, -mtune, -mcpu
934 handling code or by target attributes. */
935 static const struct processor *selected_arch;
936 static const struct processor *selected_cpu;
937 static const struct processor *selected_tune;
939 /* The current tuning set. */
940 struct tune_params aarch64_tune_params = generic_tunings;
942 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
944 /* An ISA extension in the co-processor and main instruction set space. */
945 struct aarch64_option_extension
947 const char *const name;
948 const unsigned long flags_on;
949 const unsigned long flags_off;
952 typedef enum aarch64_cond_code
954 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
955 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
956 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
958 aarch64_cc;
960 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
962 /* The condition codes of the processor, and the inverse function. */
963 static const char * const aarch64_condition_codes[] =
965 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
966 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
969 /* Generate code to enable conditional branches in functions over 1 MiB. */
970 const char *
971 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
972 const char * branch_format)
974 rtx_code_label * tmp_label = gen_label_rtx ();
975 char label_buf[256];
976 char buffer[128];
977 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
978 CODE_LABEL_NUMBER (tmp_label));
979 const char *label_ptr = targetm.strip_name_encoding (label_buf);
980 rtx dest_label = operands[pos_label];
981 operands[pos_label] = tmp_label;
983 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
984 output_asm_insn (buffer, operands);
986 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
987 operands[pos_label] = dest_label;
988 output_asm_insn (buffer, operands);
989 return "";
992 void
993 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
995 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
996 if (TARGET_GENERAL_REGS_ONLY)
997 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
998 else
999 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1002 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1003 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1004 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1005 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1006 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1007 irrespective of its cost results in bad allocations with many redundant
1008 int<->FP moves which are expensive on various cores.
1009 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1010 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1011 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1012 Otherwise set the allocno class depending on the mode.
1013 The result of this is that it is no longer inefficient to have a higher
1014 memory move cost than the register move cost.
1017 static reg_class_t
1018 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1019 reg_class_t best_class)
1021 machine_mode mode;
1023 if (allocno_class != ALL_REGS)
1024 return allocno_class;
1026 if (best_class != ALL_REGS)
1027 return best_class;
1029 mode = PSEUDO_REGNO_MODE (regno);
1030 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
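/* For instance, a pseudo of DFmode or V4SImode whose allocno and best
   classes both came back as ALL_REGS is steered to FP_REGS here, while a
   DImode pseudo is steered to GENERAL_REGS, avoiding the redundant
   int<->FP moves described above.  */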
1033 static unsigned int
1034 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1036 if (GET_MODE_UNIT_SIZE (mode) == 4)
1037 return aarch64_tune_params.min_div_recip_mul_sf;
1038 return aarch64_tune_params.min_div_recip_mul_df;
1041 static int
1042 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1043 machine_mode mode)
1045 if (VECTOR_MODE_P (mode))
1046 return aarch64_tune_params.vec_reassoc_width;
1047 if (INTEGRAL_MODE_P (mode))
1048 return aarch64_tune_params.int_reassoc_width;
1049 if (FLOAT_MODE_P (mode))
1050 return aarch64_tune_params.fp_reassoc_width;
1051 return 1;
1054 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1055 unsigned
1056 aarch64_dbx_register_number (unsigned regno)
1058 if (GP_REGNUM_P (regno))
1059 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1060 else if (regno == SP_REGNUM)
1061 return AARCH64_DWARF_SP;
1062 else if (FP_REGNUM_P (regno))
1063 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1065 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1066 equivalent DWARF register. */
1067 return DWARF_FRAME_REGISTERS;
1070 /* Return TRUE if MODE is any of the large INT modes. */
1071 static bool
1072 aarch64_vect_struct_mode_p (machine_mode mode)
1074 return mode == OImode || mode == CImode || mode == XImode;
1077 /* Return TRUE if MODE is any of the vector modes. */
1078 static bool
1079 aarch64_vector_mode_p (machine_mode mode)
1081 return aarch64_vector_mode_supported_p (mode)
1082 || aarch64_vect_struct_mode_p (mode);
1085 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1086 static bool
1087 aarch64_array_mode_supported_p (machine_mode mode,
1088 unsigned HOST_WIDE_INT nelems)
1090 if (TARGET_SIMD
1091 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1092 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1093 && (nelems >= 2 && nelems <= 4))
1094 return true;
1096 return false;
1099 /* Implement TARGET_HARD_REGNO_NREGS. */
1101 static unsigned int
1102 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1104 switch (aarch64_regno_regclass (regno))
1106 case FP_REGS:
1107 case FP_LO_REGS:
1108 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1109 default:
1110 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1112 gcc_unreachable ();
1115 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1117 static bool
1118 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1120 if (GET_MODE_CLASS (mode) == MODE_CC)
1121 return regno == CC_REGNUM;
1123 if (regno == SP_REGNUM)
1124 /* The purpose of comparing with ptr_mode is to support the
1125 global register variable associated with the stack pointer
1126 register via the syntax of asm ("wsp") in ILP32. */
1127 return mode == Pmode || mode == ptr_mode;
1129 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1130 return mode == Pmode;
1132 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1133 return true;
1135 if (FP_REGNUM_P (regno))
1137 if (aarch64_vect_struct_mode_p (mode))
1138 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1139 else
1140 return true;
1143 return false;
1146 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1147 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1148 clobbers the top 64 bits when restoring the bottom 64 bits. */
1150 static bool
1151 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1153 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1156 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1157 machine_mode
1158 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1159 machine_mode mode)
1161 /* Handle modes that fit within single registers. */
1162 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1164 if (GET_MODE_SIZE (mode) >= 4)
1165 return mode;
1166 else
1167 return SImode;
1169 /* Fall back to generic for multi-reg and very large modes. */
1170 else
1171 return choose_hard_reg_mode (regno, nregs, false);
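/* For example, a QImode or HImode value occupying a single register is
   saved and restored in SImode here, while multi-register values fall
   back to choose_hard_reg_mode.  */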
1174 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1175 that strcpy from constants will be faster. */
1177 static HOST_WIDE_INT
1178 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1180 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1181 return MAX (align, BITS_PER_WORD);
1182 return align;
1185 /* Return true if calls to DECL should be treated as
1186 long-calls (i.e. called via a register). */
1187 static bool
1188 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1190 return false;
1193 /* Return true if calls to symbol-ref SYM should be treated as
1194 long-calls (i.e. called via a register). */
1195 bool
1196 aarch64_is_long_call_p (rtx sym)
1198 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1201 /* Return true if calls to symbol-ref SYM should not go through
1202 plt stubs. */
1204 bool
1205 aarch64_is_noplt_call_p (rtx sym)
1207 const_tree decl = SYMBOL_REF_DECL (sym);
1209 if (flag_pic
1210 && decl
1211 && (!flag_plt
1212 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1213 && !targetm.binds_local_p (decl))
1214 return true;
1216 return false;
1219 /* Return true if the offsets to a zero/sign-extract operation
1220 represent an expression that matches an extend operation. The
1221 operands represent the parameters from
1223 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1224 bool
1225 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1226 rtx extract_imm)
1228 HOST_WIDE_INT mult_val, extract_val;
1230 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1231 return false;
1233 mult_val = INTVAL (mult_imm);
1234 extract_val = INTVAL (extract_imm);
1236 if (extract_val > 8
1237 && extract_val < GET_MODE_BITSIZE (mode)
1238 && exact_log2 (extract_val & ~7) > 0
1239 && (extract_val & 7) <= 4
1240 && mult_val == (1 << (extract_val & 7)))
1241 return true;
1243 return false;
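/* Illustrative values (not taken from the original comment): EXTRACT_IMM
   == 34 together with MULT_IMM == 4 passes the checks above and describes
   the low 32 bits of a register shifted left by 2, i.e. roughly a
   UXTW/SXTW #2 style extend-and-shift operand.  */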
1246 /* Emit an insn that's a simple single-set. Both the operands must be
1247 known to be valid. */
1248 inline static rtx_insn *
1249 emit_set_insn (rtx x, rtx y)
1251 return emit_insn (gen_rtx_SET (x, y));
1254 /* X and Y are two things to compare using CODE. Emit the compare insn and
1255 return the rtx for the CC register in the proper mode. */
1257 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1259 machine_mode mode = SELECT_CC_MODE (code, x, y);
1260 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1262 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1263 return cc_reg;
1266 /* Build the SYMBOL_REF for __tls_get_addr. */
1268 static GTY(()) rtx tls_get_addr_libfunc;
1271 aarch64_tls_get_addr (void)
1273 if (!tls_get_addr_libfunc)
1274 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1275 return tls_get_addr_libfunc;
1278 /* Return the TLS model to use for ADDR. */
1280 static enum tls_model
1281 tls_symbolic_operand_type (rtx addr)
1283 enum tls_model tls_kind = TLS_MODEL_NONE;
1284 rtx sym, addend;
1286 if (GET_CODE (addr) == CONST)
1288 split_const (addr, &sym, &addend);
1289 if (GET_CODE (sym) == SYMBOL_REF)
1290 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1292 else if (GET_CODE (addr) == SYMBOL_REF)
1293 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1295 return tls_kind;
1298 /* We'll allow lo_sum's in addresses in our legitimate addresses
1299 so that combine would take care of combining addresses where
1300 necessary, but for generation purposes, we'll generate the address
1301 as:
1302 RTL                                   Absolute
1303    tmp = hi (symbol_ref);             adrp x1, foo
1304    dest = lo_sum (tmp, symbol_ref);   add dest, x1, :lo_12:foo
1307 PIC                                   TLS
1308    adrp x1, :got:foo                  adrp tmp, :tlsgd:foo
1309    ldr x1, [:got_lo12:foo]            add dest, tmp, :tlsgd_lo12:foo
1310                                       bl __tls_get_addr
1313 Load TLS symbol, depending on TLS mechanism and TLS access model.
1315 Global Dynamic - Traditional TLS:
1316 adrp tmp, :tlsgd:imm
1317 add dest, tmp, #:tlsgd_lo12:imm
1318 bl __tls_get_addr
1320 Global Dynamic - TLS Descriptors:
1321 adrp dest, :tlsdesc:imm
1322 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1323 add dest, dest, #:tlsdesc_lo12:imm
1324 blr tmp
1325 mrs tp, tpidr_el0
1326 add dest, dest, tp
1328 Initial Exec:
1329 mrs tp, tpidr_el0
1330 adrp tmp, :gottprel:imm
1331 ldr dest, [tmp, #:gottprel_lo12:imm]
1332 add dest, dest, tp
1334 Local Exec:
1335 mrs tp, tpidr_el0
1336 add t0, tp, #:tprel_hi12:imm, lsl #12
1337 add t0, t0, #:tprel_lo12_nc:imm
1340 static void
1341 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1342 enum aarch64_symbol_type type)
1344 switch (type)
1346 case SYMBOL_SMALL_ABSOLUTE:
1348 /* In ILP32, the mode of dest can be either SImode or DImode. */
1349 rtx tmp_reg = dest;
1350 machine_mode mode = GET_MODE (dest);
1352 gcc_assert (mode == Pmode || mode == ptr_mode);
1354 if (can_create_pseudo_p ())
1355 tmp_reg = gen_reg_rtx (mode);
1357 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1358 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1359 return;
1362 case SYMBOL_TINY_ABSOLUTE:
1363 emit_insn (gen_rtx_SET (dest, imm));
1364 return;
1366 case SYMBOL_SMALL_GOT_28K:
1368 machine_mode mode = GET_MODE (dest);
1369 rtx gp_rtx = pic_offset_table_rtx;
1370 rtx insn;
1371 rtx mem;
1373 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1374 here before RTL expansion. Tree IVOPTs will generate RTL patterns to
1375 decide rtx costs, in which case pic_offset_table_rtx is not
1376 initialized. In that case there is no need to generate the first adrp
1377 instruction, as the final cost for global variable access is
1378 one instruction. */
1379 if (gp_rtx != NULL)
1381 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1382 use the page base as the GOT base, the first page may be wasted;
1383 in the worst case there is only 28K of space for the GOT).
1385 The generated instruction sequence for accessing a global variable is:
1388 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1390 Only one instruction is needed, but we must initialize
1391 pic_offset_table_rtx properly. We generate an initialization insn for
1392 every global access, and allow CSE to remove all redundant ones.
1394 The final instruction sequence will look like the following
1395 for multiple global variable accesses:
1397 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1399 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1400 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1401 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1402 ... */
1404 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1405 crtl->uses_pic_offset_table = 1;
1406 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1408 if (mode != GET_MODE (gp_rtx))
1409 gp_rtx = gen_lowpart (mode, gp_rtx);
1413 if (mode == ptr_mode)
1415 if (mode == DImode)
1416 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1417 else
1418 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1420 mem = XVECEXP (SET_SRC (insn), 0, 0);
1422 else
1424 gcc_assert (mode == Pmode);
1426 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1427 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1430 /* The operand is expected to be MEM. Whenever the related insn
1431 pattern changes, the code above which calculates MEM should be
1432 updated. */
1433 gcc_assert (GET_CODE (mem) == MEM);
1434 MEM_READONLY_P (mem) = 1;
1435 MEM_NOTRAP_P (mem) = 1;
1436 emit_insn (insn);
1437 return;
1440 case SYMBOL_SMALL_GOT_4G:
1442 /* In ILP32, the mode of dest can be either SImode or DImode,
1443 while the got entry is always of SImode size. The mode of
1444 dest depends on how dest is used: if dest is assigned to a
1445 pointer (e.g. stored in memory), it has SImode; it may have
1446 DImode if dest is dereferenced to access the memory.
1447 This is why we have to handle three different ldr_got_small
1448 patterns here (two patterns for ILP32). */
1450 rtx insn;
1451 rtx mem;
1452 rtx tmp_reg = dest;
1453 machine_mode mode = GET_MODE (dest);
1455 if (can_create_pseudo_p ())
1456 tmp_reg = gen_reg_rtx (mode);
1458 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1459 if (mode == ptr_mode)
1461 if (mode == DImode)
1462 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1463 else
1464 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1466 mem = XVECEXP (SET_SRC (insn), 0, 0);
1468 else
1470 gcc_assert (mode == Pmode);
1472 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1473 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1476 gcc_assert (GET_CODE (mem) == MEM);
1477 MEM_READONLY_P (mem) = 1;
1478 MEM_NOTRAP_P (mem) = 1;
1479 emit_insn (insn);
1480 return;
1483 case SYMBOL_SMALL_TLSGD:
1485 rtx_insn *insns;
1486 machine_mode mode = GET_MODE (dest);
1487 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1489 start_sequence ();
1490 if (TARGET_ILP32)
1491 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1492 else
1493 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1494 insns = get_insns ();
1495 end_sequence ();
1497 RTL_CONST_CALL_P (insns) = 1;
1498 emit_libcall_block (insns, dest, result, imm);
1499 return;
1502 case SYMBOL_SMALL_TLSDESC:
1504 machine_mode mode = GET_MODE (dest);
1505 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1506 rtx tp;
1508 gcc_assert (mode == Pmode || mode == ptr_mode);
1510 /* In ILP32, the got entry is always of SImode size. Unlike
1511 small GOT, the dest is fixed at reg 0. */
1512 if (TARGET_ILP32)
1513 emit_insn (gen_tlsdesc_small_si (imm));
1514 else
1515 emit_insn (gen_tlsdesc_small_di (imm));
1516 tp = aarch64_load_tp (NULL);
1518 if (mode != Pmode)
1519 tp = gen_lowpart (mode, tp);
1521 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1522 if (REG_P (dest))
1523 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1524 return;
1527 case SYMBOL_SMALL_TLSIE:
1529 /* In ILP32, the mode of dest can be either SImode or DImode,
1530 while the got entry is always of SImode size. The mode of
1531 dest depends on how dest is used: if dest is assigned to a
1532 pointer (e.g. stored in memory), it has SImode; it may have
1533 DImode if dest is dereferenced to access the memory.
1534 This is why we have to handle three different tlsie_small
1535 patterns here (two patterns for ILP32). */
1536 machine_mode mode = GET_MODE (dest);
1537 rtx tmp_reg = gen_reg_rtx (mode);
1538 rtx tp = aarch64_load_tp (NULL);
1540 if (mode == ptr_mode)
1542 if (mode == DImode)
1543 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1544 else
1546 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1547 tp = gen_lowpart (mode, tp);
1550 else
1552 gcc_assert (mode == Pmode);
1553 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1556 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1557 if (REG_P (dest))
1558 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559 return;
1562 case SYMBOL_TLSLE12:
1563 case SYMBOL_TLSLE24:
1564 case SYMBOL_TLSLE32:
1565 case SYMBOL_TLSLE48:
1567 machine_mode mode = GET_MODE (dest);
1568 rtx tp = aarch64_load_tp (NULL);
1570 if (mode != Pmode)
1571 tp = gen_lowpart (mode, tp);
1573 switch (type)
1575 case SYMBOL_TLSLE12:
1576 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1577 (dest, tp, imm));
1578 break;
1579 case SYMBOL_TLSLE24:
1580 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1581 (dest, tp, imm));
1582 break;
1583 case SYMBOL_TLSLE32:
1584 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1585 (dest, imm));
1586 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1587 (dest, dest, tp));
1588 break;
1589 case SYMBOL_TLSLE48:
1590 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1591 (dest, imm));
1592 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1593 (dest, dest, tp));
1594 break;
1595 default:
1596 gcc_unreachable ();
1599 if (REG_P (dest))
1600 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1601 return;
1604 case SYMBOL_TINY_GOT:
1605 emit_insn (gen_ldr_got_tiny (dest, imm));
1606 return;
1608 case SYMBOL_TINY_TLSIE:
1610 machine_mode mode = GET_MODE (dest);
1611 rtx tp = aarch64_load_tp (NULL);
1613 if (mode == ptr_mode)
1615 if (mode == DImode)
1616 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1617 else
1619 tp = gen_lowpart (mode, tp);
1620 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1623 else
1625 gcc_assert (mode == Pmode);
1626 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1629 if (REG_P (dest))
1630 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1631 return;
1634 default:
1635 gcc_unreachable ();
1639 /* Emit a move from SRC to DEST. Assume that the move expanders can
1640 handle all moves if !can_create_pseudo_p (). The distinction is
1641 important because, unlike emit_move_insn, the move expanders know
1642 how to force Pmode objects into the constant pool even when the
1643 constant pool address is not itself legitimate. */
1644 static rtx
1645 aarch64_emit_move (rtx dest, rtx src)
1647 return (can_create_pseudo_p ()
1648 ? emit_move_insn (dest, src)
1649 : emit_move_insn_1 (dest, src));
1652 /* Split a 128-bit move operation into two 64-bit move operations,
1653 taking care to handle partial overlap of register to register
1654 copies. Special cases are needed when moving between GP regs and
1655 FP regs. SRC can be a register, constant or memory; DST a register
1656 or memory. If either operand is memory it must not have any side
1657 effects. */
1658 void
1659 aarch64_split_128bit_move (rtx dst, rtx src)
1661 rtx dst_lo, dst_hi;
1662 rtx src_lo, src_hi;
1664 machine_mode mode = GET_MODE (dst);
1666 gcc_assert (mode == TImode || mode == TFmode);
1667 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1668 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1670 if (REG_P (dst) && REG_P (src))
1672 int src_regno = REGNO (src);
1673 int dst_regno = REGNO (dst);
1675 /* Handle FP <-> GP regs. */
1676 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1678 src_lo = gen_lowpart (word_mode, src);
1679 src_hi = gen_highpart (word_mode, src);
1681 if (mode == TImode)
1683 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1684 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1686 else
1688 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1689 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1691 return;
1693 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1695 dst_lo = gen_lowpart (word_mode, dst);
1696 dst_hi = gen_highpart (word_mode, dst);
1698 if (mode == TImode)
1700 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1701 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1703 else
1705 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1706 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1708 return;
1712 dst_lo = gen_lowpart (word_mode, dst);
1713 dst_hi = gen_highpart (word_mode, dst);
1714 src_lo = gen_lowpart (word_mode, src);
1715 src_hi = gen_highpart_mode (word_mode, mode, src);
1717 /* At most one pairing may overlap. */
1718 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1720 aarch64_emit_move (dst_hi, src_hi);
1721 aarch64_emit_move (dst_lo, src_lo);
1723 else
1725 aarch64_emit_move (dst_lo, src_lo);
1726 aarch64_emit_move (dst_hi, src_hi);
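/* For instance (hard registers chosen for illustration), splitting a
   TImode copy whose source lives in x2/x3 and whose destination lives in
   x3/x4 takes the first branch above: dst_lo (x3) overlaps src_hi (x3),
   so the high halves are moved before the low halves and nothing is
   clobbered.  */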
1730 bool
1731 aarch64_split_128bit_move_p (rtx dst, rtx src)
1733 return (! REG_P (src)
1734 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1737 /* Split a complex SIMD combine. */
1739 void
1740 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1742 machine_mode src_mode = GET_MODE (src1);
1743 machine_mode dst_mode = GET_MODE (dst);
1745 gcc_assert (VECTOR_MODE_P (dst_mode));
1746 gcc_assert (register_operand (dst, dst_mode)
1747 && register_operand (src1, src_mode)
1748 && register_operand (src2, src_mode));
1750 rtx (*gen) (rtx, rtx, rtx);
1752 switch (src_mode)
1754 case E_V8QImode:
1755 gen = gen_aarch64_simd_combinev8qi;
1756 break;
1757 case E_V4HImode:
1758 gen = gen_aarch64_simd_combinev4hi;
1759 break;
1760 case E_V2SImode:
1761 gen = gen_aarch64_simd_combinev2si;
1762 break;
1763 case E_V4HFmode:
1764 gen = gen_aarch64_simd_combinev4hf;
1765 break;
1766 case E_V2SFmode:
1767 gen = gen_aarch64_simd_combinev2sf;
1768 break;
1769 case E_DImode:
1770 gen = gen_aarch64_simd_combinedi;
1771 break;
1772 case E_DFmode:
1773 gen = gen_aarch64_simd_combinedf;
1774 break;
1775 default:
1776 gcc_unreachable ();
1779 emit_insn (gen (dst, src1, src2));
1780 return;
1783 /* Split a complex SIMD move. */
1785 void
1786 aarch64_split_simd_move (rtx dst, rtx src)
1788 machine_mode src_mode = GET_MODE (src);
1789 machine_mode dst_mode = GET_MODE (dst);
1791 gcc_assert (VECTOR_MODE_P (dst_mode));
1793 if (REG_P (dst) && REG_P (src))
1795 rtx (*gen) (rtx, rtx);
1797 gcc_assert (VECTOR_MODE_P (src_mode));
1799 switch (src_mode)
1801 case E_V16QImode:
1802 gen = gen_aarch64_split_simd_movv16qi;
1803 break;
1804 case E_V8HImode:
1805 gen = gen_aarch64_split_simd_movv8hi;
1806 break;
1807 case E_V4SImode:
1808 gen = gen_aarch64_split_simd_movv4si;
1809 break;
1810 case E_V2DImode:
1811 gen = gen_aarch64_split_simd_movv2di;
1812 break;
1813 case E_V8HFmode:
1814 gen = gen_aarch64_split_simd_movv8hf;
1815 break;
1816 case E_V4SFmode:
1817 gen = gen_aarch64_split_simd_movv4sf;
1818 break;
1819 case E_V2DFmode:
1820 gen = gen_aarch64_split_simd_movv2df;
1821 break;
1822 default:
1823 gcc_unreachable ();
1826 emit_insn (gen (dst, src));
1827 return;
1831 bool
1832 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1833 machine_mode ymode, rtx y)
1835 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1836 gcc_assert (r != NULL);
1837 return rtx_equal_p (x, r);
1841 static rtx
1842 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1844 if (can_create_pseudo_p ())
1845 return force_reg (mode, value);
1846 else
1848 x = aarch64_emit_move (x, value);
1849 return x;
1854 static rtx
1855 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1856 HOST_WIDE_INT offset)
1858 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1860 rtx high;
1861 /* Load the full offset into a register. This
1862 might be improvable in the future. */
1863 high = GEN_INT (offset);
1864 offset = 0;
1865 high = aarch64_force_temporary (mode, temp, high);
1866 reg = aarch64_force_temporary (mode, temp,
1867 gen_rtx_PLUS (mode, high, reg));
1869 return plus_constant (mode, reg, offset);
1872 static int
1873 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1874 scalar_int_mode mode)
1876 int i;
1877 unsigned HOST_WIDE_INT val, val2, mask;
1878 int one_match, zero_match;
1879 int num_insns;
1881 val = INTVAL (imm);
1883 if (aarch64_move_imm (val, mode))
1885 if (generate)
1886 emit_insn (gen_rtx_SET (dest, imm));
1887 return 1;
1890 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1891 (with XXXX non-zero). In that case check to see if the move can be done in
1892 a smaller mode. */
1893 val2 = val & 0xffffffff;
1894 if (mode == DImode
1895 && aarch64_move_imm (val2, SImode)
1896 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1898 if (generate)
1899 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1901 /* Check whether we have to emit a second instruction by seeing if any
1902 of the upper 32 bits of the original DImode value are set. */
1903 if (val == val2)
1904 return 1;
1906 i = (val >> 48) ? 48 : 32;
1908 if (generate)
1909 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1910 GEN_INT ((val >> i) & 0xffff)));
1912 return 2;
1915 if ((val >> 32) == 0 || mode == SImode)
1917 if (generate)
1919 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1920 if (mode == SImode)
1921 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1922 GEN_INT ((val >> 16) & 0xffff)));
1923 else
1924 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1925 GEN_INT ((val >> 16) & 0xffff)));
1927 return 2;
1930 /* Remaining cases are all for DImode. */
1932 mask = 0xffff;
1933 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1934 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1935 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1936 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1938 if (zero_match != 2 && one_match != 2)
1940 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1941 For a 64-bit bitmask try whether changing 16 bits to all ones or
1942 zeroes creates a valid bitmask. To check any repeated bitmask,
1943 try using 16 bits from the other 32-bit half of val. */
1945 for (i = 0; i < 64; i += 16, mask <<= 16)
1947 val2 = val & ~mask;
1948 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1949 break;
1950 val2 = val | mask;
1951 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1952 break;
1953 val2 = val2 & ~mask;
1954 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1955 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1956 break;
1958 if (i != 64)
1960 if (generate)
1962 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1963 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1964 GEN_INT ((val >> i) & 0xffff)));
1966 return 2;
1970 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1971 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1972 otherwise skip zero bits. */
1974 num_insns = 1;
1975 mask = 0xffff;
1976 val2 = one_match > zero_match ? ~val : val;
1977 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1979 if (generate)
1980 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1981 ? (val | ~(mask << i))
1982 : (val & (mask << i)))));
1983 for (i += 16; i < 64; i += 16)
1985 if ((val2 & (mask << i)) == 0)
1986 continue;
1987 if (generate)
1988 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1989 GEN_INT ((val >> i) & 0xffff)));
1990 num_insns ++;
1993 return num_insns;
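/* A worked example of the early-exit path above (constant chosen for
   illustration): for val == 0x1234000000005678 the low 32 bits form a
   valid move immediate and bits 32-47 are all zero, so the sequence is

	mov    dest, #0x5678
	movk   dest, #0x1234, lsl #48

   i.e. two instructions instead of a full four-instruction mov/movk
   sequence.  */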
1996 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1997 temporary value if necessary. FRAME_RELATED_P should be true if
1998 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1999 to the generated instructions. If SCRATCHREG is known to hold
2000 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2001 immediate again.
2003 Since this function may be used to adjust the stack pointer, we must
2004 ensure that it cannot cause transient stack deallocation (for example
2005 by first incrementing SP and then decrementing when adjusting by a
2006 large immediate). */
2008 static void
2009 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2010 int scratchreg, HOST_WIDE_INT delta,
2011 bool frame_related_p, bool emit_move_imm)
2013 HOST_WIDE_INT mdelta = abs_hwi (delta);
2014 rtx this_rtx = gen_rtx_REG (mode, regnum);
2015 rtx_insn *insn;
2017 if (!mdelta)
2018 return;
2020 /* Single instruction adjustment. */
2021 if (aarch64_uimm12_shift (mdelta))
2023 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2024 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2025 return;
2028 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2029 Only do this if mdelta cannot be handled by a single 16-bit move, as
2030 adjusting via a move is better in that case. */
2031 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2033 HOST_WIDE_INT low_off = mdelta & 0xfff;
2035 low_off = delta < 0 ? -low_off : low_off;
2036 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2037 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2038 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2039 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2040 return;
2043 /* Emit a move immediate if required and an addition/subtraction. */
2044 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2045 if (emit_move_imm)
2046 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2047 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2048 : gen_add2_insn (this_rtx, scratch_rtx));
2049 if (frame_related_p)
2051 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2052 rtx adj = plus_constant (mode, this_rtx, delta);
2053 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
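/* Illustrative sketch of the three cases above; the deltas are chosen for
   exposition and are not taken from the surrounding code.  A delta of
   0x2000 fits a single shifted 12-bit immediate:

	add	x0, x0, 8192

   A delta of 0x123456 (under 24 bits and not a move immediate) is split
   into a low 12-bit part and the remainder:

	add	x0, x0, 0x456
	add	x0, x0, 0x123000

   Anything larger is built in the scratch register first, e.g. adjusting
   SP by a hypothetical 0x123456789 via x16:

	mov	x16, 0x6789
	movk	x16, 0x2345, lsl 16
	movk	x16, 0x1, lsl 32
	sub	sp, sp, x16

   which is why the adjustment must never transiently deallocate the
   stack, as noted in the comment above.  */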
2057 static inline void
2058 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2059 HOST_WIDE_INT delta)
2061 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2064 static inline void
2065 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2067 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2068 true, emit_move_imm);
2071 static inline void
2072 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2074 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2075 frame_related_p, true);
2078 void
2079 aarch64_expand_mov_immediate (rtx dest, rtx imm)
2081 machine_mode mode = GET_MODE (dest);
2083 gcc_assert (mode == SImode || mode == DImode);
2085 /* Check on what type of symbol it is. */
2086 scalar_int_mode int_mode;
2087 if ((GET_CODE (imm) == SYMBOL_REF
2088 || GET_CODE (imm) == LABEL_REF
2089 || GET_CODE (imm) == CONST)
2090 && is_a <scalar_int_mode> (mode, &int_mode))
2092 rtx mem, base, offset;
2093 enum aarch64_symbol_type sty;
2095 /* If we have (const (plus symbol offset)), separate out the offset
2096 before we start classifying the symbol. */
2097 split_const (imm, &base, &offset);
2099 sty = aarch64_classify_symbol (base, offset);
2100 switch (sty)
2102 case SYMBOL_FORCE_TO_MEM:
2103 if (offset != const0_rtx
2104 && targetm.cannot_force_const_mem (int_mode, imm))
2106 gcc_assert (can_create_pseudo_p ());
2107 base = aarch64_force_temporary (int_mode, dest, base);
2108 base = aarch64_add_offset (int_mode, NULL, base,
2109 INTVAL (offset));
2110 aarch64_emit_move (dest, base);
2111 return;
2114 mem = force_const_mem (ptr_mode, imm);
2115 gcc_assert (mem);
2117 /* If we aren't generating PC relative literals, then
2118 we need to expand the literal pool access carefully.
2119 This is something that needs to be done in a number
2120 of places, so could well live as a separate function. */
2121 if (!aarch64_pcrelative_literal_loads)
2123 gcc_assert (can_create_pseudo_p ());
2124 base = gen_reg_rtx (ptr_mode);
2125 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2126 if (ptr_mode != Pmode)
2127 base = convert_memory_address (Pmode, base);
2128 mem = gen_rtx_MEM (ptr_mode, base);
2131 if (int_mode != ptr_mode)
2132 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2134 emit_insn (gen_rtx_SET (dest, mem));
2136 return;
2138 case SYMBOL_SMALL_TLSGD:
2139 case SYMBOL_SMALL_TLSDESC:
2140 case SYMBOL_SMALL_TLSIE:
2141 case SYMBOL_SMALL_GOT_28K:
2142 case SYMBOL_SMALL_GOT_4G:
2143 case SYMBOL_TINY_GOT:
2144 case SYMBOL_TINY_TLSIE:
2145 if (offset != const0_rtx)
2147 gcc_assert (can_create_pseudo_p ());
2148 base = aarch64_force_temporary (int_mode, dest, base);
2149 base = aarch64_add_offset (int_mode, NULL, base,
2150 INTVAL (offset));
2151 aarch64_emit_move (dest, base);
2152 return;
2154 /* FALLTHRU */
2156 case SYMBOL_SMALL_ABSOLUTE:
2157 case SYMBOL_TINY_ABSOLUTE:
2158 case SYMBOL_TLSLE12:
2159 case SYMBOL_TLSLE24:
2160 case SYMBOL_TLSLE32:
2161 case SYMBOL_TLSLE48:
2162 aarch64_load_symref_appropriately (dest, imm, sty);
2163 return;
2165 default:
2166 gcc_unreachable ();
2170 if (!CONST_INT_P (imm))
2172 if (GET_CODE (imm) == HIGH)
2173 emit_insn (gen_rtx_SET (dest, imm));
2174 else
2176 rtx mem = force_const_mem (mode, imm);
2177 gcc_assert (mem);
2178 emit_insn (gen_rtx_SET (dest, mem));
2181 return;
2184 aarch64_internal_mov_immediate (dest, imm, true,
2185 as_a <scalar_int_mode> (mode));
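/* Illustrative sketch (symbol name hypothetical): under the small code
   model a SYMBOL_SMALL_ABSOLUTE address with a small folded offset is
   expanded by aarch64_load_symref_appropriately into an ADRP/ADD pair,
   typically

	adrp	x0, foo+16
	add	x0, x0, :lo12:foo+16

   while for the GOT and TLS cases above a nonzero offset is split off and
   added separately after the bare symbol has been loaded.  */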
2188 static bool
2189 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2190 tree exp ATTRIBUTE_UNUSED)
2192 /* Currently, always true. */
2193 return true;
2196 /* Implement TARGET_PASS_BY_REFERENCE. */
2198 static bool
2199 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2200 machine_mode mode,
2201 const_tree type,
2202 bool named ATTRIBUTE_UNUSED)
2204 HOST_WIDE_INT size;
2205 machine_mode dummymode;
2206 int nregs;
2208 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2209 size = (mode == BLKmode && type)
2210 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2212 /* Aggregates are passed by reference based on their size. */
2213 if (type && AGGREGATE_TYPE_P (type))
2215 size = int_size_in_bytes (type);
2218 /* Variable sized arguments are always passed by reference. */
2219 if (size < 0)
2220 return true;
2222 /* Can this be a candidate to be passed in fp/simd register(s)? */
2223 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2224 &dummymode, &nregs,
2225 NULL))
2226 return false;
2228 /* Arguments which are variable sized or larger than 2 registers are
2229 passed by reference unless they are a homogeneous floating point
2230 aggregate. */
2231 return size > 2 * UNITS_PER_WORD;
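/* Illustrative sketch (types hypothetical): a
   struct { double a, b, c, d; } is a homogeneous floating-point aggregate
   with four members, so the candidate check above returns false and it is
   passed in v0-v3 even though it is 32 bytes.  A struct of three
   uint64_t fields, by contrast, is 24 bytes and not an HFA, so it is
   replaced by a pointer to a caller-allocated copy.  */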
2234 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2235 static bool
2236 aarch64_return_in_msb (const_tree valtype)
2238 machine_mode dummy_mode;
2239 int dummy_int;
2241 /* Never happens in little-endian mode. */
2242 if (!BYTES_BIG_ENDIAN)
2243 return false;
2245 /* Only composite types smaller than or equal to 16 bytes can
2246 be potentially returned in registers. */
2247 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2248 || int_size_in_bytes (valtype) <= 0
2249 || int_size_in_bytes (valtype) > 16)
2250 return false;
2252 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2253 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2254 is always passed/returned in the least significant bits of fp/simd
2255 register(s). */
2256 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2257 &dummy_mode, &dummy_int, NULL))
2258 return false;
2260 return true;
2263 /* Implement TARGET_FUNCTION_VALUE.
2264 Define how to find the value returned by a function. */
2266 static rtx
2267 aarch64_function_value (const_tree type, const_tree func,
2268 bool outgoing ATTRIBUTE_UNUSED)
2270 machine_mode mode;
2271 int unsignedp;
2272 int count;
2273 machine_mode ag_mode;
2275 mode = TYPE_MODE (type);
2276 if (INTEGRAL_TYPE_P (type))
2277 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2279 if (aarch64_return_in_msb (type))
2281 HOST_WIDE_INT size = int_size_in_bytes (type);
2283 if (size % UNITS_PER_WORD != 0)
2285 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2286 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2290 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2291 &ag_mode, &count, NULL))
2293 if (!aarch64_composite_type_p (type, mode))
2295 gcc_assert (count == 1 && mode == ag_mode);
2296 return gen_rtx_REG (mode, V0_REGNUM);
2298 else
2300 int i;
2301 rtx par;
2303 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2304 for (i = 0; i < count; i++)
2306 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2307 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2308 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2309 XVECEXP (par, 0, i) = tmp;
2311 return par;
2314 else
2315 return gen_rtx_REG (mode, R0_REGNUM);
2318 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2319 Return true if REGNO is the number of a hard register in which the values
2320 of called function may come back. */
2322 static bool
2323 aarch64_function_value_regno_p (const unsigned int regno)
2325 /* Maximum of 16 bytes can be returned in the general registers. Examples
2326 of 16-byte return values are: 128-bit integers and 16-byte small
2327 structures (excluding homogeneous floating-point aggregates). */
2328 if (regno == R0_REGNUM || regno == R1_REGNUM)
2329 return true;
2331 /* Up to four fp/simd registers can return a function value, e.g. a
2332 homogeneous floating-point aggregate having four members. */
2333 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2334 return TARGET_FLOAT;
2336 return false;
2339 /* Implement TARGET_RETURN_IN_MEMORY.
2341 If the type T of the result of a function is such that
2342 void func (T arg)
2343 would require that arg be passed as a value in a register (or set of
2344 registers) according to the parameter passing rules, then the result
2345 is returned in the same registers as would be used for such an
2346 argument. */
2348 static bool
2349 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2351 HOST_WIDE_INT size;
2352 machine_mode ag_mode;
2353 int count;
2355 if (!AGGREGATE_TYPE_P (type)
2356 && TREE_CODE (type) != COMPLEX_TYPE
2357 && TREE_CODE (type) != VECTOR_TYPE)
2358 /* Simple scalar types are always returned in registers. */
2359 return false;
2361 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2362 type,
2363 &ag_mode,
2364 &count,
2365 NULL))
2366 return false;
2368 /* Types larger than 2 registers are returned in memory. */
2369 size = int_size_in_bytes (type);
2370 return (size < 0 || size > 2 * UNITS_PER_WORD);
2373 static bool
2374 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2375 const_tree type, int *nregs)
2377 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2378 return aarch64_vfp_is_call_or_return_candidate (mode,
2379 type,
2380 &pcum->aapcs_vfp_rmode,
2381 nregs,
2382 NULL);
2385 /* Given MODE and TYPE of a function argument, return the alignment in
2386 bits. The idea is to suppress any stronger alignment requested by
2387 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2388 This is a helper function for local use only. */
2390 static unsigned int
2391 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2393 if (!type)
2394 return GET_MODE_ALIGNMENT (mode);
2396 if (integer_zerop (TYPE_SIZE (type)))
2397 return 0;
2399 gcc_assert (TYPE_MODE (type) == mode);
2401 if (!AGGREGATE_TYPE_P (type))
2402 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2404 if (TREE_CODE (type) == ARRAY_TYPE)
2405 return TYPE_ALIGN (TREE_TYPE (type));
2407 unsigned int alignment = 0;
2408 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2409 if (TREE_CODE (field) == FIELD_DECL)
2410 alignment = std::max (alignment, DECL_ALIGN (field));
2412 return alignment;
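/* Illustrative sketch (type hypothetical): for a struct { __int128 x; }
   the single FIELD_DECL has 16-byte alignment, so this returns 128 bits
   and rule C.8 in aarch64_layout_arg below rounds the NGRN up to an even
   register number before assigning the argument.  */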
2415 /* Layout a function argument according to the AAPCS64 rules. The rule
2416 numbers refer to the rule numbers in the AAPCS64. */
2418 static void
2419 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2420 const_tree type,
2421 bool named ATTRIBUTE_UNUSED)
2423 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2424 int ncrn, nvrn, nregs;
2425 bool allocate_ncrn, allocate_nvrn;
2426 HOST_WIDE_INT size;
2428 /* We need to do this once per argument. */
2429 if (pcum->aapcs_arg_processed)
2430 return;
2432 pcum->aapcs_arg_processed = true;
2434 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2435 size
2436 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2437 UNITS_PER_WORD);
2439 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2440 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2441 mode,
2442 type,
2443 &nregs);
2445 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2446 The following code thus handles passing by SIMD/FP registers first. */
2448 nvrn = pcum->aapcs_nvrn;
2450 /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
2451 and homogeneous short-vector aggregates (HVA). */
2452 if (allocate_nvrn)
2454 if (!TARGET_FLOAT)
2455 aarch64_err_no_fpadvsimd (mode, "argument");
2457 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2459 pcum->aapcs_nextnvrn = nvrn + nregs;
2460 if (!aarch64_composite_type_p (type, mode))
2462 gcc_assert (nregs == 1);
2463 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2465 else
2467 rtx par;
2468 int i;
2469 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2470 for (i = 0; i < nregs; i++)
2472 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2473 V0_REGNUM + nvrn + i);
2474 tmp = gen_rtx_EXPR_LIST
2475 (VOIDmode, tmp,
2476 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2477 XVECEXP (par, 0, i) = tmp;
2479 pcum->aapcs_reg = par;
2481 return;
2483 else
2485 /* C.3 NSRN is set to 8. */
2486 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2487 goto on_stack;
2491 ncrn = pcum->aapcs_ncrn;
2492 nregs = size / UNITS_PER_WORD;
2494 /* C6 - C9, though the sign and zero extension semantics are
2495 handled elsewhere. This is the case where the argument fits
2496 entirely in general registers. */
2497 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2500 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2502 /* C.8 if the argument has an alignment of 16 then the NGRN is
2503 rounded up to the next even number. */
2504 if (nregs == 2
2505 && ncrn % 2
2506 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2507 comparison is there because for > 16 * BITS_PER_UNIT
2508 alignment nregs should be > 2 and therefore it should be
2509 passed by reference rather than value. */
2510 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2512 ++ncrn;
2513 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2516 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2517 A reg is still generated for it, but the caller should be smart
2518 enough not to use it. */
2519 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2520 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2521 else
2523 rtx par;
2524 int i;
2526 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2527 for (i = 0; i < nregs; i++)
2529 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2530 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2531 GEN_INT (i * UNITS_PER_WORD));
2532 XVECEXP (par, 0, i) = tmp;
2534 pcum->aapcs_reg = par;
2537 pcum->aapcs_nextncrn = ncrn + nregs;
2538 return;
2541 /* C.11 */
2542 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2544 /* The argument is passed on the stack; record the number of words needed for
2545 this argument and align the total size if necessary. */
2546 on_stack:
2547 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2549 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2550 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2551 16 / UNITS_PER_WORD);
2552 return;
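/* Illustrative sketch of the rules above for a hypothetical prototype
   f (int a, __int128 b, double c, struct { float v[4]; } d):
   'a' takes w0; 'b' has 16-byte alignment, so C.8 skips x1 and it
   occupies x2/x3; 'c' takes d0; 'd' is an HFA of four floats and takes
   s1-s4.  If an HFA does not fit in the remaining SIMD/FP registers,
   C.3 sets the NSRN to 8 and the whole aggregate goes to the stack via
   the on_stack path.  */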
2555 /* Implement TARGET_FUNCTION_ARG. */
2557 static rtx
2558 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2559 const_tree type, bool named)
2561 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2562 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2564 if (mode == VOIDmode)
2565 return NULL_RTX;
2567 aarch64_layout_arg (pcum_v, mode, type, named);
2568 return pcum->aapcs_reg;
2571 void
2572 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2573 const_tree fntype ATTRIBUTE_UNUSED,
2574 rtx libname ATTRIBUTE_UNUSED,
2575 const_tree fndecl ATTRIBUTE_UNUSED,
2576 unsigned n_named ATTRIBUTE_UNUSED)
2578 pcum->aapcs_ncrn = 0;
2579 pcum->aapcs_nvrn = 0;
2580 pcum->aapcs_nextncrn = 0;
2581 pcum->aapcs_nextnvrn = 0;
2582 pcum->pcs_variant = ARM_PCS_AAPCS64;
2583 pcum->aapcs_reg = NULL_RTX;
2584 pcum->aapcs_arg_processed = false;
2585 pcum->aapcs_stack_words = 0;
2586 pcum->aapcs_stack_size = 0;
2588 if (!TARGET_FLOAT
2589 && fndecl && TREE_PUBLIC (fndecl)
2590 && fntype && fntype != error_mark_node)
2592 const_tree type = TREE_TYPE (fntype);
2593 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2594 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2595 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2596 &mode, &nregs, NULL))
2597 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2599 return;
2602 static void
2603 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2604 machine_mode mode,
2605 const_tree type,
2606 bool named)
2608 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2609 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2611 aarch64_layout_arg (pcum_v, mode, type, named);
2612 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2613 != (pcum->aapcs_stack_words != 0));
2614 pcum->aapcs_arg_processed = false;
2615 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2616 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2617 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2618 pcum->aapcs_stack_words = 0;
2619 pcum->aapcs_reg = NULL_RTX;
2623 bool
2624 aarch64_function_arg_regno_p (unsigned regno)
2626 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2627 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2630 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2631 PARM_BOUNDARY bits of alignment, but will be given anything up
2632 to STACK_BOUNDARY bits if the type requires it. This makes sure
2633 that both before and after the layout of each argument, the Next
2634 Stacked Argument Address (NSAA) will have a minimum alignment of
2635 8 bytes. */
2637 static unsigned int
2638 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2640 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2641 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2644 /* Implement TARGET_FUNCTION_ARG_PADDING.
2646 Small aggregate types are placed in the lowest memory address.
2648 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2650 static pad_direction
2651 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2653 /* On little-endian targets, the least significant byte of every stack
2654 argument is passed at the lowest byte address of the stack slot. */
2655 if (!BYTES_BIG_ENDIAN)
2656 return PAD_UPWARD;
2658 /* Otherwise, integral, floating-point and pointer types are padded downward:
2659 the least significant byte of a stack argument is passed at the highest
2660 byte address of the stack slot. */
2661 if (type
2662 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2663 || POINTER_TYPE_P (type))
2664 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2665 return PAD_DOWNWARD;
2667 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2668 return PAD_UPWARD;
2671 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2673 It specifies padding for the last (possibly the only)
2674 element of a block move between registers and memory.
2675 Assuming the block is in memory, padding upward means that
2676 the last element is padded after its most significant byte,
2677 while with downward padding the last element is padded on
2678 its least significant byte side.
2680 Small aggregates and small complex types are always padded
2681 upwards.
2683 We don't need to worry about homogeneous floating-point or
2684 short-vector aggregates; their move is not affected by the
2685 padding direction determined here. Regardless of endianness,
2686 each element of such an aggregate is put in the least
2687 significant bits of a fp/simd register.
2689 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2690 register has useful data, and return the opposite if the most
2691 significant byte does. */
2693 bool
2694 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2695 bool first ATTRIBUTE_UNUSED)
2698 /* Small composite types are always padded upward. */
2699 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2701 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2702 : GET_MODE_SIZE (mode));
2703 if (size < 2 * UNITS_PER_WORD)
2704 return true;
2707 /* Otherwise, use the default padding. */
2708 return !BYTES_BIG_ENDIAN;
2711 static scalar_int_mode
2712 aarch64_libgcc_cmp_return_mode (void)
2714 return SImode;
2717 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2719 /* We use the 12-bit shifted immediate arithmetic instructions so values
2720 must be multiple of (1 << 12), i.e. 4096. */
2721 #define ARITH_FACTOR 4096
2723 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2724 #error Cannot use simple address calculation for stack probing
2725 #endif
2727 /* The pair of scratch registers used for stack probing. */
2728 #define PROBE_STACK_FIRST_REG 9
2729 #define PROBE_STACK_SECOND_REG 10
2731 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2732 inclusive. These are offsets from the current stack pointer. */
2734 static void
2735 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2737 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2739 /* See the same assertion on PROBE_INTERVAL above. */
2740 gcc_assert ((first % ARITH_FACTOR) == 0);
2742 /* See if we have a constant small number of probes to generate. If so,
2743 that's the easy case. */
2744 if (size <= PROBE_INTERVAL)
2746 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2748 emit_set_insn (reg1,
2749 plus_constant (Pmode,
2750 stack_pointer_rtx, -(first + base)));
2751 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2754 /* The run-time loop is made up of 8 insns in the generic case while the
2755 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2756 else if (size <= 4 * PROBE_INTERVAL)
2758 HOST_WIDE_INT i, rem;
2760 emit_set_insn (reg1,
2761 plus_constant (Pmode,
2762 stack_pointer_rtx,
2763 -(first + PROBE_INTERVAL)));
2764 emit_stack_probe (reg1);
2766 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2767 it exceeds SIZE. If only two probes are needed, this will not
2768 generate any code. Then probe at FIRST + SIZE. */
2769 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2771 emit_set_insn (reg1,
2772 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2773 emit_stack_probe (reg1);
2776 rem = size - (i - PROBE_INTERVAL);
2777 if (rem > 256)
2779 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2781 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2782 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2784 else
2785 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2788 /* Otherwise, do the same as above, but in a loop. Note that we must be
2789 extra careful with variables wrapping around because we might be at
2790 the very top (or the very bottom) of the address space and we have
2791 to be able to handle this case properly; in particular, we use an
2792 equality test for the loop condition. */
2793 else
2795 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2797 /* Step 1: round SIZE to the previous multiple of the interval. */
2799 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2802 /* Step 2: compute initial and final value of the loop counter. */
2804 /* TEST_ADDR = SP + FIRST. */
2805 emit_set_insn (reg1,
2806 plus_constant (Pmode, stack_pointer_rtx, -first));
2808 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2809 HOST_WIDE_INT adjustment = - (first + rounded_size);
2810 if (! aarch64_uimm12_shift (adjustment))
2812 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2813 true, Pmode);
2814 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2816 else
2818 emit_set_insn (reg2,
2819 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2822 /* Step 3: the loop
2826 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2827 probe at TEST_ADDR
2829 while (TEST_ADDR != LAST_ADDR)
2831 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2832 until it is equal to ROUNDED_SIZE. */
2834 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2837 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2838 that SIZE is equal to ROUNDED_SIZE. */
2840 if (size != rounded_size)
2842 HOST_WIDE_INT rem = size - rounded_size;
2844 if (rem > 256)
2846 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2848 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2849 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2851 else
2852 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2856 /* Make sure nothing is scheduled before we are done. */
2857 emit_insn (gen_blockage ());
2860 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2861 absolute addresses. */
2863 const char *
2864 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2866 static int labelno = 0;
2867 char loop_lab[32];
2868 rtx xops[2];
2870 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2872 /* Loop. */
2873 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2875 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2876 xops[0] = reg1;
2877 xops[1] = GEN_INT (PROBE_INTERVAL);
2878 output_asm_insn ("sub\t%0, %0, %1", xops);
2880 /* Probe at TEST_ADDR. */
2881 output_asm_insn ("str\txzr, [%0]", xops);
2883 /* Test if TEST_ADDR == LAST_ADDR. */
2884 xops[1] = reg2;
2885 output_asm_insn ("cmp\t%0, %1", xops);
2887 /* Branch. */
2888 fputs ("\tb.ne\t", asm_out_file);
2889 assemble_name_raw (asm_out_file, loop_lab);
2890 fputc ('\n', asm_out_file);
2892 return "";
2895 /* Mark the registers that need to be saved by the callee and calculate
2896 the size of the callee-saved registers area and frame record (both FP
2897 and LR may be omitted). */
2898 static void
2899 aarch64_layout_frame (void)
2901 HOST_WIDE_INT offset = 0;
2902 int regno, last_fp_reg = INVALID_REGNUM;
2904 if (reload_completed && cfun->machine->frame.laid_out)
2905 return;
2907 /* Force a frame chain for EH returns so the return address is at FP+8. */
2908 cfun->machine->frame.emit_frame_chain
2909 = frame_pointer_needed || crtl->calls_eh_return;
2911 /* Emit a frame chain if the frame pointer is enabled.
2912 If -momit-leaf-frame-pointer is used, do not use a frame chain
2913 in leaf functions which do not use LR. */
2914 if (flag_omit_frame_pointer == 2
2915 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
2916 && !df_regs_ever_live_p (LR_REGNUM)))
2917 cfun->machine->frame.emit_frame_chain = true;
2919 #define SLOT_NOT_REQUIRED (-2)
2920 #define SLOT_REQUIRED (-1)
2922 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2923 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2925 /* First mark all the registers that really need to be saved... */
2926 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2927 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2929 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2930 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2932 /* ... that includes the eh data registers (if needed)... */
2933 if (crtl->calls_eh_return)
2934 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2935 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2936 = SLOT_REQUIRED;
2938 /* ... and any callee saved register that dataflow says is live. */
2939 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2940 if (df_regs_ever_live_p (regno)
2941 && (regno == R30_REGNUM
2942 || !call_used_regs[regno]))
2943 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2945 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2946 if (df_regs_ever_live_p (regno)
2947 && !call_used_regs[regno])
2949 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2950 last_fp_reg = regno;
2953 if (cfun->machine->frame.emit_frame_chain)
2955 /* FP and LR are placed in the linkage record. */
2956 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2957 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2958 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2959 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2960 offset = 2 * UNITS_PER_WORD;
2963 /* Now assign stack slots for them. */
2964 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2965 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2967 cfun->machine->frame.reg_offset[regno] = offset;
2968 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2969 cfun->machine->frame.wb_candidate1 = regno;
2970 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2971 cfun->machine->frame.wb_candidate2 = regno;
2972 offset += UNITS_PER_WORD;
2975 HOST_WIDE_INT max_int_offset = offset;
2976 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2977 bool has_align_gap = offset != max_int_offset;
2979 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2980 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2982 /* If there is an alignment gap between integer and fp callee-saves,
2983 allocate the last fp register to it if possible. */
2984 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2986 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2987 break;
2990 cfun->machine->frame.reg_offset[regno] = offset;
2991 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2992 cfun->machine->frame.wb_candidate1 = regno;
2993 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2994 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2995 cfun->machine->frame.wb_candidate2 = regno;
2996 offset += UNITS_PER_WORD;
2999 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3001 cfun->machine->frame.saved_regs_size = offset;
3003 HOST_WIDE_INT varargs_and_saved_regs_size
3004 = offset + cfun->machine->frame.saved_varargs_size;
3006 cfun->machine->frame.hard_fp_offset
3007 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
3008 STACK_BOUNDARY / BITS_PER_UNIT);
3010 cfun->machine->frame.frame_size
3011 = ROUND_UP (cfun->machine->frame.hard_fp_offset
3012 + crtl->outgoing_args_size,
3013 STACK_BOUNDARY / BITS_PER_UNIT);
3015 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3017 cfun->machine->frame.initial_adjust = 0;
3018 cfun->machine->frame.final_adjust = 0;
3019 cfun->machine->frame.callee_adjust = 0;
3020 cfun->machine->frame.callee_offset = 0;
3022 HOST_WIDE_INT max_push_offset = 0;
3023 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3024 max_push_offset = 512;
3025 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3026 max_push_offset = 256;
3028 if (cfun->machine->frame.frame_size < max_push_offset
3029 && crtl->outgoing_args_size == 0)
3031 /* Simple, small frame with no outgoing arguments:
3032 stp reg1, reg2, [sp, -frame_size]!
3033 stp reg3, reg4, [sp, 16] */
3034 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3036 else if ((crtl->outgoing_args_size
3037 + cfun->machine->frame.saved_regs_size < 512)
3038 && !(cfun->calls_alloca
3039 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3041 /* Frame with small outgoing arguments:
3042 sub sp, sp, frame_size
3043 stp reg1, reg2, [sp, outgoing_args_size]
3044 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3045 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3046 cfun->machine->frame.callee_offset
3047 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3049 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3051 /* Frame with large outgoing arguments but a small local area:
3052 stp reg1, reg2, [sp, -hard_fp_offset]!
3053 stp reg3, reg4, [sp, 16]
3054 sub sp, sp, outgoing_args_size */
3055 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3056 cfun->machine->frame.final_adjust
3057 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3059 else
3061 /* Frame with large local area and outgoing arguments using frame pointer:
3062 sub sp, sp, hard_fp_offset
3063 stp x29, x30, [sp, 0]
3064 add x29, sp, 0
3065 stp reg3, reg4, [sp, 16]
3066 sub sp, sp, outgoing_args_size */
3067 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3068 cfun->machine->frame.final_adjust
3069 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3072 cfun->machine->frame.laid_out = true;
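/* Illustrative sketch with made-up numbers: a function that needs a frame
   chain, saves x19 and d8, and has 32 bytes of locals and no outgoing
   arguments gets reg_offset[x29] = 0, reg_offset[x30] = 8,
   reg_offset[x19] = 16 and, because rounding the integer saves up to a
   16-byte boundary leaves an 8-byte gap, reg_offset[d8] = 24.  That makes
   saved_regs_size = 32, hard_fp_offset = 64 and frame_size = 64; since 64
   is below max_push_offset and outgoing_args_size is zero, the first
   (simple, small frame) case above is chosen and callee_adjust = 64.  */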
3075 /* Return true if the register REGNO is saved on entry to
3076 the current function. */
3078 static bool
3079 aarch64_register_saved_on_entry (int regno)
3081 return cfun->machine->frame.reg_offset[regno] >= 0;
3084 /* Return the next register, from REGNO up to LIMIT, that the callee
3085 needs to save. */
3087 static unsigned
3088 aarch64_next_callee_save (unsigned regno, unsigned limit)
3090 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3091 regno ++;
3092 return regno;
3095 /* Push the register number REGNO of mode MODE to the stack with write-back
3096 adjusting the stack by ADJUSTMENT. */
3098 static void
3099 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3100 HOST_WIDE_INT adjustment)
3102 rtx base_rtx = stack_pointer_rtx;
3103 rtx insn, reg, mem;
3105 reg = gen_rtx_REG (mode, regno);
3106 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3107 plus_constant (Pmode, base_rtx, -adjustment));
3108 mem = gen_frame_mem (mode, mem);
3110 insn = emit_move_insn (mem, reg);
3111 RTX_FRAME_RELATED_P (insn) = 1;
3114 /* Generate and return an instruction to store the pair of registers
3115 REG and REG2 of mode MODE to location BASE with write-back adjusting
3116 the stack location BASE by ADJUSTMENT. */
3118 static rtx
3119 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3120 HOST_WIDE_INT adjustment)
3122 switch (mode)
3124 case E_DImode:
3125 return gen_storewb_pairdi_di (base, base, reg, reg2,
3126 GEN_INT (-adjustment),
3127 GEN_INT (UNITS_PER_WORD - adjustment));
3128 case E_DFmode:
3129 return gen_storewb_pairdf_di (base, base, reg, reg2,
3130 GEN_INT (-adjustment),
3131 GEN_INT (UNITS_PER_WORD - adjustment));
3132 default:
3133 gcc_unreachable ();
3137 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3138 stack pointer by ADJUSTMENT. */
3140 static void
3141 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3143 rtx_insn *insn;
3144 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3146 if (regno2 == INVALID_REGNUM)
3147 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3149 rtx reg1 = gen_rtx_REG (mode, regno1);
3150 rtx reg2 = gen_rtx_REG (mode, regno2);
3152 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3153 reg2, adjustment));
3154 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3155 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3156 RTX_FRAME_RELATED_P (insn) = 1;
3159 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3160 adjusting BASE by ADJUSTMENT afterwards. */
3162 static rtx
3163 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3164 HOST_WIDE_INT adjustment)
3166 switch (mode)
3168 case E_DImode:
3169 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3170 GEN_INT (UNITS_PER_WORD));
3171 case E_DFmode:
3172 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3173 GEN_INT (UNITS_PER_WORD));
3174 default:
3175 gcc_unreachable ();
3179 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3180 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3181 into CFI_OPS. */
3183 static void
3184 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3185 rtx *cfi_ops)
3187 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3188 rtx reg1 = gen_rtx_REG (mode, regno1);
3190 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3192 if (regno2 == INVALID_REGNUM)
3194 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3195 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3196 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3198 else
3200 rtx reg2 = gen_rtx_REG (mode, regno2);
3201 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3202 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3203 reg2, adjustment));
3207 /* Generate and return a store pair instruction of mode MODE to store
3208 register REG1 to MEM1 and register REG2 to MEM2. */
3210 static rtx
3211 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3212 rtx reg2)
3214 switch (mode)
3216 case E_DImode:
3217 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3219 case E_DFmode:
3220 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3222 default:
3223 gcc_unreachable ();
3227 /* Generate and return a load pair instruction of mode MODE to load register
3228 REG1 from MEM1 and register REG2 from MEM2. */
3230 static rtx
3231 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3232 rtx mem2)
3234 switch (mode)
3236 case E_DImode:
3237 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3239 case E_DFmode:
3240 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3242 default:
3243 gcc_unreachable ();
3247 /* Return TRUE if return address signing should be enabled for the current
3248 function, otherwise return FALSE. */
3250 bool
3251 aarch64_return_address_signing_enabled (void)
3253 /* This function should only be called after the frame is laid out. */
3254 gcc_assert (cfun->machine->frame.laid_out);
3256 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
3257 function if its LR is pushed onto the stack. */
3258 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3259 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3260 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3263 /* Emit code to save the callee-saved registers from register number START
3264 to LIMIT to the stack at the location starting at offset START_OFFSET,
3265 skipping any write-back candidates if SKIP_WB is true. */
3267 static void
3268 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3269 unsigned start, unsigned limit, bool skip_wb)
3271 rtx_insn *insn;
3272 unsigned regno;
3273 unsigned regno2;
3275 for (regno = aarch64_next_callee_save (start, limit);
3276 regno <= limit;
3277 regno = aarch64_next_callee_save (regno + 1, limit))
3279 rtx reg, mem;
3280 HOST_WIDE_INT offset;
3282 if (skip_wb
3283 && (regno == cfun->machine->frame.wb_candidate1
3284 || regno == cfun->machine->frame.wb_candidate2))
3285 continue;
3287 if (cfun->machine->reg_is_wrapped_separately[regno])
3288 continue;
3290 reg = gen_rtx_REG (mode, regno);
3291 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3292 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3293 offset));
3295 regno2 = aarch64_next_callee_save (regno + 1, limit);
3297 if (regno2 <= limit
3298 && !cfun->machine->reg_is_wrapped_separately[regno2]
3299 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3300 == cfun->machine->frame.reg_offset[regno2]))
3303 rtx reg2 = gen_rtx_REG (mode, regno2);
3304 rtx mem2;
3306 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3307 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3308 offset));
3309 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3310 reg2));
3312 /* The first part of a frame-related parallel insn is
3313 always assumed to be relevant to the frame
3314 calculations; subsequent parts are only
3315 frame-related if explicitly marked. */
3316 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3317 regno = regno2;
3319 else
3320 insn = emit_move_insn (mem, reg);
3322 RTX_FRAME_RELATED_P (insn) = 1;
3326 /* Emit code to restore the callee registers of mode MODE from register
3327 number START up to and including LIMIT. Restore from the stack offset
3328 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3329 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3331 static void
3332 aarch64_restore_callee_saves (machine_mode mode,
3333 HOST_WIDE_INT start_offset, unsigned start,
3334 unsigned limit, bool skip_wb, rtx *cfi_ops)
3336 rtx base_rtx = stack_pointer_rtx;
3337 unsigned regno;
3338 unsigned regno2;
3339 HOST_WIDE_INT offset;
3341 for (regno = aarch64_next_callee_save (start, limit);
3342 regno <= limit;
3343 regno = aarch64_next_callee_save (regno + 1, limit))
3345 if (cfun->machine->reg_is_wrapped_separately[regno])
3346 continue;
3348 rtx reg, mem;
3350 if (skip_wb
3351 && (regno == cfun->machine->frame.wb_candidate1
3352 || regno == cfun->machine->frame.wb_candidate2))
3353 continue;
3355 reg = gen_rtx_REG (mode, regno);
3356 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3357 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3359 regno2 = aarch64_next_callee_save (regno + 1, limit);
3361 if (regno2 <= limit
3362 && !cfun->machine->reg_is_wrapped_separately[regno2]
3363 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3364 == cfun->machine->frame.reg_offset[regno2]))
3366 rtx reg2 = gen_rtx_REG (mode, regno2);
3367 rtx mem2;
3369 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3370 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3371 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3373 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3374 regno = regno2;
3376 else
3377 emit_move_insn (reg, mem);
3378 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3382 static inline bool
3383 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3384 HOST_WIDE_INT offset)
3386 return offset >= -256 && offset < 256;
3389 static inline bool
3390 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3392 return (offset >= 0
3393 && offset < 4096 * GET_MODE_SIZE (mode)
3394 && offset % GET_MODE_SIZE (mode) == 0);
3397 bool
3398 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3400 return (offset >= -64 * GET_MODE_SIZE (mode)
3401 && offset < 64 * GET_MODE_SIZE (mode)
3402 && offset % GET_MODE_SIZE (mode) == 0);
3405 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3407 static sbitmap
3408 aarch64_get_separate_components (void)
3410 aarch64_layout_frame ();
3412 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3413 bitmap_clear (components);
3415 /* The registers we need saved to the frame. */
3416 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3417 if (aarch64_register_saved_on_entry (regno))
3419 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3420 if (!frame_pointer_needed)
3421 offset += cfun->machine->frame.frame_size
3422 - cfun->machine->frame.hard_fp_offset;
3423 /* Check that we can access the stack slot of the register with one
3424 direct load with no adjustments needed. */
3425 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3426 bitmap_set_bit (components, regno);
3429 /* Don't mess with the hard frame pointer. */
3430 if (frame_pointer_needed)
3431 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3433 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3434 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3435 /* If aarch64_layout_frame has chosen registers to store/restore with
3436 writeback don't interfere with them to avoid having to output explicit
3437 stack adjustment instructions. */
3438 if (reg2 != INVALID_REGNUM)
3439 bitmap_clear_bit (components, reg2);
3440 if (reg1 != INVALID_REGNUM)
3441 bitmap_clear_bit (components, reg1);
3443 bitmap_clear_bit (components, LR_REGNUM);
3444 bitmap_clear_bit (components, SP_REGNUM);
3446 return components;
3449 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3451 static sbitmap
3452 aarch64_components_for_bb (basic_block bb)
3454 bitmap in = DF_LIVE_IN (bb);
3455 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3456 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3458 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3459 bitmap_clear (components);
3461 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3462 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3463 if ((!call_used_regs[regno])
3464 && (bitmap_bit_p (in, regno)
3465 || bitmap_bit_p (gen, regno)
3466 || bitmap_bit_p (kill, regno)))
3467 bitmap_set_bit (components, regno);
3469 return components;
3472 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3473 Nothing to do for aarch64. */
3475 static void
3476 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3480 /* Return the next set bit in BMP from START onwards. Return the total number
3481 of bits in BMP if no set bit is found at or after START. */
3483 static unsigned int
3484 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3486 unsigned int nbits = SBITMAP_SIZE (bmp);
3487 if (start == nbits)
3488 return start;
3490 gcc_assert (start < nbits);
3491 for (unsigned int i = start; i < nbits; i++)
3492 if (bitmap_bit_p (bmp, i))
3493 return i;
3495 return nbits;
3498 /* Do the work for aarch64_emit_prologue_components and
3499 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3500 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3501 for these components or the epilogue sequence. That is, it determines
3502 whether we should emit stores or loads and what kind of CFA notes to attach
3503 to the insns. Otherwise the logic for the two sequences is very
3504 similar. */
3506 static void
3507 aarch64_process_components (sbitmap components, bool prologue_p)
3509 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3510 ? HARD_FRAME_POINTER_REGNUM
3511 : STACK_POINTER_REGNUM);
3513 unsigned last_regno = SBITMAP_SIZE (components);
3514 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3515 rtx_insn *insn = NULL;
3517 while (regno != last_regno)
3519 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3520 so DFmode for the vector registers is enough. */
3521 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3522 rtx reg = gen_rtx_REG (mode, regno);
3523 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3524 if (!frame_pointer_needed)
3525 offset += cfun->machine->frame.frame_size
3526 - cfun->machine->frame.hard_fp_offset;
3527 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3528 rtx mem = gen_frame_mem (mode, addr);
3530 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3531 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3532 /* No more registers to handle after REGNO.
3533 Emit a single save/restore and exit. */
3534 if (regno2 == last_regno)
3536 insn = emit_insn (set);
3537 RTX_FRAME_RELATED_P (insn) = 1;
3538 if (prologue_p)
3539 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3540 else
3541 add_reg_note (insn, REG_CFA_RESTORE, reg);
3542 break;
3545 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3546 /* The next register is not of the same class or its offset is not
3547 mergeable with the current one into a pair. */
3548 if (!satisfies_constraint_Ump (mem)
3549 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3550 || (offset2 - cfun->machine->frame.reg_offset[regno])
3551 != GET_MODE_SIZE (mode))
3553 insn = emit_insn (set);
3554 RTX_FRAME_RELATED_P (insn) = 1;
3555 if (prologue_p)
3556 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3557 else
3558 add_reg_note (insn, REG_CFA_RESTORE, reg);
3560 regno = regno2;
3561 continue;
3564 /* REGNO2 can be saved/restored in a pair with REGNO. */
3565 rtx reg2 = gen_rtx_REG (mode, regno2);
3566 if (!frame_pointer_needed)
3567 offset2 += cfun->machine->frame.frame_size
3568 - cfun->machine->frame.hard_fp_offset;
3569 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3570 rtx mem2 = gen_frame_mem (mode, addr2);
3571 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3572 : gen_rtx_SET (reg2, mem2);
3574 if (prologue_p)
3575 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3576 else
3577 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3579 RTX_FRAME_RELATED_P (insn) = 1;
3580 if (prologue_p)
3582 add_reg_note (insn, REG_CFA_OFFSET, set);
3583 add_reg_note (insn, REG_CFA_OFFSET, set2);
3585 else
3587 add_reg_note (insn, REG_CFA_RESTORE, reg);
3588 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3591 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3595 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3597 static void
3598 aarch64_emit_prologue_components (sbitmap components)
3600 aarch64_process_components (components, true);
3603 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3605 static void
3606 aarch64_emit_epilogue_components (sbitmap components)
3608 aarch64_process_components (components, false);
3611 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3613 static void
3614 aarch64_set_handled_components (sbitmap components)
3616 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3617 if (bitmap_bit_p (components, regno))
3618 cfun->machine->reg_is_wrapped_separately[regno] = true;
3621 /* AArch64 stack frames generated by this compiler look like:
3623 +-------------------------------+
3625 | incoming stack arguments |
3627 +-------------------------------+
3628 | | <-- incoming stack pointer (aligned)
3629 | callee-allocated save area |
3630 | for register varargs |
3632 +-------------------------------+
3633 | local variables | <-- frame_pointer_rtx
3635 +-------------------------------+
3636 | padding0 | \
3637 +-------------------------------+ |
3638 | callee-saved registers | | frame.saved_regs_size
3639 +-------------------------------+ |
3640 | LR' | |
3641 +-------------------------------+ |
3642 | FP' | / <- hard_frame_pointer_rtx (aligned)
3643 +-------------------------------+
3644 | dynamic allocation |
3645 +-------------------------------+
3646 | padding |
3647 +-------------------------------+
3648 | outgoing stack arguments | <-- arg_pointer
3650 +-------------------------------+
3651 | | <-- stack_pointer_rtx (aligned)
3653 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3654 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3655 unchanged. */
3657 /* Generate the prologue instructions for entry into a function.
3658 Establish the stack frame by decreasing the stack pointer with a
3659 properly calculated size and, if necessary, create a frame record
3660 filled with the values of LR and previous frame pointer. The
3661 current FP is also set up if it is in use. */
3663 void
3664 aarch64_expand_prologue (void)
3666 aarch64_layout_frame ();
3668 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3669 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3670 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3671 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3672 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3673 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3674 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3675 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
3676 rtx_insn *insn;
3678 /* Sign return address for functions. */
3679 if (aarch64_return_address_signing_enabled ())
3681 insn = emit_insn (gen_pacisp ());
3682 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3683 RTX_FRAME_RELATED_P (insn) = 1;
3686 if (flag_stack_usage_info)
3687 current_function_static_stack_size = frame_size;
3689 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3691 if (crtl->is_leaf && !cfun->calls_alloca)
3693 if (frame_size > PROBE_INTERVAL
3694 && frame_size > get_stack_check_protect ())
3695 aarch64_emit_probe_stack_range (get_stack_check_protect (),
3696 (frame_size
3697 - get_stack_check_protect ()));
3699 else if (frame_size > 0)
3700 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3703 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3705 if (callee_adjust != 0)
3706 aarch64_push_regs (reg1, reg2, callee_adjust);
3708 if (emit_frame_chain)
3710 if (callee_adjust == 0)
3711 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3712 R30_REGNUM, false);
3713 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3714 stack_pointer_rtx,
3715 GEN_INT (callee_offset)));
3716 RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
3717 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3720 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3721 callee_adjust != 0 || emit_frame_chain);
3722 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3723 callee_adjust != 0 || emit_frame_chain);
3724 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
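/* Illustrative sketch (continuing the made-up frame from the layout
   example above, callee_adjust == 64 with a frame chain): the code above
   would emit roughly

	stp	x29, x30, [sp, -64]!
	mov	x29, sp
	str	x19, [sp, 16]
	str	d8, [sp, 24]

   i.e. the writeback push performs the whole stack allocation, the frame
   chain is established, and the remaining callee saves land at their
   reg_offset slots relative to the new SP.  */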
3727 /* Return TRUE if we can use a simple_return insn.
3729 This function checks whether the callee-saved stack is empty, which
3730 means no restore actions are needed. The pro_and_epilogue pass uses
3731 this to check whether the shrink-wrapping optimization is feasible.
3733 bool
3734 aarch64_use_return_insn_p (void)
3736 if (!reload_completed)
3737 return false;
3739 if (crtl->profile)
3740 return false;
3742 aarch64_layout_frame ();
3744 return cfun->machine->frame.frame_size == 0;
3747 /* Generate the epilogue instructions for returning from a function.
3748 This is almost exactly the reverse of the prologue sequence, except
3749 that we need to insert barriers to avoid scheduling loads that read
3750 from a deallocated stack, and we optimize the unwind records by
3751 emitting them all together if possible. */
3752 void
3753 aarch64_expand_epilogue (bool for_sibcall)
3755 aarch64_layout_frame ();
3757 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3758 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3759 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3760 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3761 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3762 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3763 rtx cfi_ops = NULL;
3764 rtx_insn *insn;
3766 /* We need a memory barrier to prevent reads from the deallocated stack. */
3767 bool need_barrier_p = (get_frame_size ()
3768 + cfun->machine->frame.saved_varargs_size) != 0;
3770 /* Emit a barrier to prevent loads from a deallocated stack. */
3771 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3772 || crtl->calls_eh_return)
3774 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3775 need_barrier_p = false;
3778 /* Restore the stack pointer from the frame pointer if it may not
3779 be the same as the stack pointer. */
3780 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3782 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3783 hard_frame_pointer_rtx,
3784 GEN_INT (-callee_offset)));
3785 /* If writeback is used when restoring callee-saves, the CFA
3786 is restored on the instruction doing the writeback. */
3787 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3789 else
3790 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3792 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3793 callee_adjust != 0, &cfi_ops);
3794 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3795 callee_adjust != 0, &cfi_ops);
3797 if (need_barrier_p)
3798 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3800 if (callee_adjust != 0)
3801 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3803 if (callee_adjust != 0 || initial_adjust > 65536)
3805 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3806 insn = get_last_insn ();
3807 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3808 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3809 RTX_FRAME_RELATED_P (insn) = 1;
3810 cfi_ops = NULL;
3813 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3815 if (cfi_ops)
3817 /* Emit delayed restores and reset the CFA to be SP. */
3818 insn = get_last_insn ();
3819 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3820 REG_NOTES (insn) = cfi_ops;
3821 RTX_FRAME_RELATED_P (insn) = 1;
3824 /* We prefer to emit the combined return/authenticate instruction RETAA;
3825 however, there are three cases in which we must instead emit an explicit
3826 authentication instruction.
3828 1) Sibcalls don't return in a normal way, so if we're about to call one
3829 we must authenticate.
3831 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3832 generating code for !TARGET_ARMV8_3 we can't use it and must
3833 explicitly authenticate.
3835 3) On an eh_return path we make extra stack adjustments to update the
3836 canonical frame address to be the exception handler's CFA. We want
3837 to authenticate using the CFA of the function which calls eh_return. */
3839 if (aarch64_return_address_signing_enabled ()
3840 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3842 insn = emit_insn (gen_autisp ());
3843 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3844 RTX_FRAME_RELATED_P (insn) = 1;
3847 /* Stack adjustment for exception handler. */
3848 if (crtl->calls_eh_return)
3850 /* We need to unwind the stack by the offset computed by
3851 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3852 to be SP; letting the CFA move during this adjustment
3853 is just as correct as retaining the CFA from the body
3854 of the function. Therefore, do nothing special. */
3855 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3858 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3859 if (!for_sibcall)
3860 emit_jump_insn (ret_rtx);
3863 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3864 normally or return to a previous frame after unwinding.
3866 An EH return uses a single shared return sequence. The epilogue is
3867 exactly like a normal epilogue except that it has an extra input
3868 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3869 that must be applied after the frame has been destroyed. An extra label
3870 is inserted before the epilogue which initializes this register to zero,
3871 and this is the entry point for a normal return.
3873 An actual EH return updates the return address, initializes the stack
3874 adjustment and jumps directly into the epilogue (bypassing the zeroing
3875 of the adjustment). Since the return address is typically saved on the
3876 stack when a function makes a call, the saved LR must be updated outside
3877 the epilogue.
3879 This poses problems as the store is generated well before the epilogue,
3880 so the offset of LR is not known yet. Also, optimizations will remove the
3881 store since it appears dead, even after the epilogue is generated (as the
3882 base or offset for loading LR is different in many cases).
3884 To avoid these problems this implementation forces the frame pointer
3885 in eh_return functions so that the location of LR is fixed and known early.
3886 It also marks the store volatile, so no optimization is permitted to
3887 remove the store. */
3889 aarch64_eh_return_handler_rtx (void)
3891 rtx tmp = gen_frame_mem (Pmode,
3892 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3894 /* Mark the store volatile, so no optimization is permitted to remove it. */
3895 MEM_VOLATILE_P (tmp) = true;
3896 return tmp;
3899 /* Output code to add DELTA to the first argument, and then jump
3900 to FUNCTION. Used for C++ multiple inheritance. */
3901 static void
3902 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3903 HOST_WIDE_INT delta,
3904 HOST_WIDE_INT vcall_offset,
3905 tree function)
3907 /* The this pointer is always in x0. Note that this differs from
3908 Arm, where the this pointer may be bumped to r1 if r0 is required
3909 to return a pointer to an aggregate. On AArch64 a result value
3910 pointer will be in x8. */
3911 int this_regno = R0_REGNUM;
3912 rtx this_rtx, temp0, temp1, addr, funexp;
3913 rtx_insn *insn;
3915 reload_completed = 1;
3916 emit_note (NOTE_INSN_PROLOGUE_END);
3918 if (vcall_offset == 0)
3919 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3920 else
3922 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3924 this_rtx = gen_rtx_REG (Pmode, this_regno);
3925 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3926 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3928 addr = this_rtx;
3929 if (delta != 0)
3931 if (delta >= -256 && delta < 256)
3932 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3933 plus_constant (Pmode, this_rtx, delta));
3934 else
3935 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3938 if (Pmode == ptr_mode)
3939 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3940 else
3941 aarch64_emit_move (temp0,
3942 gen_rtx_ZERO_EXTEND (Pmode,
3943 gen_rtx_MEM (ptr_mode, addr)));
3945 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3946 addr = plus_constant (Pmode, temp0, vcall_offset);
3947 else
3949 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3950 Pmode);
3951 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3954 if (Pmode == ptr_mode)
3955 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3956 else
3957 aarch64_emit_move (temp1,
3958 gen_rtx_SIGN_EXTEND (Pmode,
3959 gen_rtx_MEM (ptr_mode, addr)));
3961 emit_insn (gen_add2_insn (this_rtx, temp1));
3964 /* Generate a tail call to the target function. */
3965 if (!TREE_USED (function))
3967 assemble_external (function);
3968 TREE_USED (function) = 1;
3970 funexp = XEXP (DECL_RTL (function), 0);
3971 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3972 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3973 SIBLING_CALL_P (insn) = 1;
3975 insn = get_insns ();
3976 shorten_branches (insn);
3977 final_start_function (insn, file, 1);
3978 final (insn, file, 1);
3979 final_end_function ();
3981 /* Stop pretending to be a post-reload pass. */
3982 reload_completed = 0;
3985 static bool
3986 aarch64_tls_referenced_p (rtx x)
3988 if (!TARGET_HAVE_TLS)
3989 return false;
3990 subrtx_iterator::array_type array;
3991 FOR_EACH_SUBRTX (iter, array, x, ALL)
3993 const_rtx x = *iter;
3994 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3995 return true;
3996 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3997 TLS offsets, not real symbol references. */
3998 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3999 iter.skip_subrtxes ();
4001 return false;
4005 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4006 a left shift of 0 or 12 bits. */
4007 bool
4008 aarch64_uimm12_shift (HOST_WIDE_INT val)
4010 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
4011 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
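/* For example, 0x123 (shift 0) and 0x123000 (shift 12) satisfy this test,
   whereas 0x1230 does not because its set bits straddle both 12-bit fields.  */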
4016 /* Return true if val is an immediate that can be loaded into a
4017 register by a MOVZ instruction. */
4018 static bool
4019 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4021 if (GET_MODE_SIZE (mode) > 4)
4023 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4024 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4025 return 1;
4027 else
4029 /* Ignore sign extension. */
4030 val &= (HOST_WIDE_INT) 0xffffffff;
4032 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4033 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
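/* For example, 0x12340000 has a single non-zero 16-bit field and can be
   loaded with one MOVZ (movz w0, #0x1234, lsl #16).  */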
4036 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4038 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4040 0x0000000100000001ull,
4041 0x0001000100010001ull,
4042 0x0101010101010101ull,
4043 0x1111111111111111ull,
4044 0x5555555555555555ull,
4048 /* Return true if val is a valid bitmask immediate. */
4050 bool
4051 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4053 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4054 int bits;
4056 /* Check for a single sequence of one bits and return quickly if so.
4057 The special cases of all ones and all zeroes return false. */
4058 val = (unsigned HOST_WIDE_INT) val_in;
4059 tmp = val + (val & -val);
4061 if (tmp == (tmp & -tmp))
4062 return (val + 1) > 1;
4064 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4065 if (mode == SImode)
4066 val = (val << 32) | (val & 0xffffffff);
4068 /* Invert if the immediate doesn't start with a zero bit - this means we
4069 only need to search for sequences of one bits. */
4070 if (val & 1)
4071 val = ~val;
4073 /* Find the first set bit and set tmp to val with the first sequence of one
4074 bits removed. Return success if there is a single sequence of ones. */
4075 first_one = val & -val;
4076 tmp = val & (val + first_one);
4078 if (tmp == 0)
4079 return true;
4081 /* Find the next set bit and compute the difference in bit position. */
4082 next_one = tmp & -tmp;
4083 bits = clz_hwi (first_one) - clz_hwi (next_one);
4084 mask = val ^ tmp;
4086 /* Check the bit position difference is a power of 2, and that the first
4087 sequence of one bits fits within 'bits' bits. */
4088 if ((mask >> bits) != 0 || bits != (bits & -bits))
4089 return false;
4091 /* Check the sequence of one bits is repeated 64/bits times. */
4092 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
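/* Worked example: for 0x00ff00ff00ff00ff the value is first inverted (it
   starts with a one bit); the first two runs of ones in the inverted value
   start at bits 8 and 24, giving a repeat width of 16, and multiplying the
   isolated 0xff00 run by 0x0001000100010001 reproduces the inverted value,
   so the immediate is accepted.  */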
4095 /* Create a mask of ones covering the range from the lowest to the highest
4096 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
4098 unsigned HOST_WIDE_INT
4099 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4101 int lowest_bit_set = ctz_hwi (val_in);
4102 int highest_bit_set = floor_log2 (val_in);
4103 gcc_assert (val_in != 0);
4105 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4106 (HOST_WIDE_INT_1U << lowest_bit_set));
4109 /* Create a constant in which every bit outside the range from the lowest
4110 set bit to the highest set bit of VAL_IN is set to 1. */
4112 unsigned HOST_WIDE_INT
4113 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4115 return val_in | ~aarch64_and_split_imm1 (val_in);
4118 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4120 bool
4121 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4123 scalar_int_mode int_mode;
4124 if (!is_a <scalar_int_mode> (mode, &int_mode))
4125 return false;
4127 if (aarch64_bitmask_imm (val_in, int_mode))
4128 return false;
4130 if (aarch64_move_imm (val_in, int_mode))
4131 return false;
4133 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4135 return aarch64_bitmask_imm (imm2, int_mode);
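/* For example, 0x00ff00ff00000000 is neither a bitmask nor a MOV immediate,
   but it splits into imm1 = 0x00ffffff00000000 and imm2 = 0xffff00ffffffffff,
   both valid bitmask immediates whose AND equals the original value, so the
   operation can be done with two AND-immediate instructions.  */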
4138 /* Return true if val is an immediate that can be loaded into a
4139 register in a single instruction. */
4140 bool
4141 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4143 scalar_int_mode int_mode;
4144 if (!is_a <scalar_int_mode> (mode, &int_mode))
4145 return false;
4147 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4148 return 1;
4149 return aarch64_bitmask_imm (val, int_mode);
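/* For example, 0xffff0000ffffffff is the bitwise NOT of 0x0000ffff00000000
   (a single 16-bit field), so it can be loaded with one MOVN instruction.  */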
4152 static bool
4153 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4155 rtx base, offset;
4157 if (GET_CODE (x) == HIGH)
4158 return true;
4160 split_const (x, &base, &offset);
4161 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4163 if (aarch64_classify_symbol (base, offset)
4164 != SYMBOL_FORCE_TO_MEM)
4165 return true;
4166 else
4167 /* Avoid generating a 64-bit relocation in ILP32; leave
4168 to aarch64_expand_mov_immediate to handle it properly. */
4169 return mode != ptr_mode;
4172 return aarch64_tls_referenced_p (x);
4175 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4176 The expansion for a table switch is quite expensive due to the number
4177 of instructions, the table lookup and the hard-to-predict indirect jump.
4178 When optimizing for speed with -O3 enabled, use the per-core tuning if
4179 set, otherwise use tables for more than 16 cases as a tradeoff between
4180 size and performance. When optimizing for size, use the default setting. */
4182 static unsigned int
4183 aarch64_case_values_threshold (void)
4185 /* Use the specified limit for the number of cases before using jump
4186 tables at higher optimization levels. */
4187 if (optimize > 2
4188 && selected_cpu->tune->max_case_values != 0)
4189 return selected_cpu->tune->max_case_values;
4190 else
4191 return optimize_size ? default_case_values_threshold () : 17;
4194 /* Return true if register REGNO is a valid index register.
4195 STRICT_P is true if REG_OK_STRICT is in effect. */
4197 bool
4198 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4200 if (!HARD_REGISTER_NUM_P (regno))
4202 if (!strict_p)
4203 return true;
4205 if (!reg_renumber)
4206 return false;
4208 regno = reg_renumber[regno];
4210 return GP_REGNUM_P (regno);
4213 /* Return true if register REGNO is a valid base register.
4214 STRICT_P is true if REG_OK_STRICT is in effect. */
4216 bool
4217 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4219 if (!HARD_REGISTER_NUM_P (regno))
4221 if (!strict_p)
4222 return true;
4224 if (!reg_renumber)
4225 return false;
4227 regno = reg_renumber[regno];
4230 /* The fake registers will be eliminated to either the stack or
4231 hard frame pointer, both of which are usually valid base registers.
4232 Reload deals with the cases where the eliminated form isn't valid. */
4233 return (GP_REGNUM_P (regno)
4234 || regno == SP_REGNUM
4235 || regno == FRAME_POINTER_REGNUM
4236 || regno == ARG_POINTER_REGNUM);
4239 /* Return true if X is a valid base register.
4240 STRICT_P is true if REG_OK_STRICT is in effect. */
4242 static bool
4243 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4245 if (!strict_p
4246 && GET_CODE (x) == SUBREG
4247 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4248 x = SUBREG_REG (x);
4250 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4253 /* Return true if address offset is a valid index. If it is, fill in INFO
4254 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4256 static bool
4257 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4258 machine_mode mode, bool strict_p)
4260 enum aarch64_address_type type;
4261 rtx index;
4262 int shift;
4264 /* (reg:P) */
4265 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4266 && GET_MODE (x) == Pmode)
4268 type = ADDRESS_REG_REG;
4269 index = x;
4270 shift = 0;
4272 /* (sign_extend:DI (reg:SI)) */
4273 else if ((GET_CODE (x) == SIGN_EXTEND
4274 || GET_CODE (x) == ZERO_EXTEND)
4275 && GET_MODE (x) == DImode
4276 && GET_MODE (XEXP (x, 0)) == SImode)
4278 type = (GET_CODE (x) == SIGN_EXTEND)
4279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280 index = XEXP (x, 0);
4281 shift = 0;
4283 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4284 else if (GET_CODE (x) == MULT
4285 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4286 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4287 && GET_MODE (XEXP (x, 0)) == DImode
4288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4289 && CONST_INT_P (XEXP (x, 1)))
4291 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293 index = XEXP (XEXP (x, 0), 0);
4294 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4296 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4297 else if (GET_CODE (x) == ASHIFT
4298 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4299 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4300 && GET_MODE (XEXP (x, 0)) == DImode
4301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4302 && CONST_INT_P (XEXP (x, 1)))
4304 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4305 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4306 index = XEXP (XEXP (x, 0), 0);
4307 shift = INTVAL (XEXP (x, 1));
4309 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4310 else if ((GET_CODE (x) == SIGN_EXTRACT
4311 || GET_CODE (x) == ZERO_EXTRACT)
4312 && GET_MODE (x) == DImode
4313 && GET_CODE (XEXP (x, 0)) == MULT
4314 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4315 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4317 type = (GET_CODE (x) == SIGN_EXTRACT)
4318 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4319 index = XEXP (XEXP (x, 0), 0);
4320 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4321 if (INTVAL (XEXP (x, 1)) != 32 + shift
4322 || INTVAL (XEXP (x, 2)) != 0)
4323 shift = -1;
4325 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4326 (const_int 0xffffffff<<shift)) */
4327 else if (GET_CODE (x) == AND
4328 && GET_MODE (x) == DImode
4329 && GET_CODE (XEXP (x, 0)) == MULT
4330 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4331 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4332 && CONST_INT_P (XEXP (x, 1)))
4334 type = ADDRESS_REG_UXTW;
4335 index = XEXP (XEXP (x, 0), 0);
4336 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4337 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4338 shift = -1;
4340 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4341 else if ((GET_CODE (x) == SIGN_EXTRACT
4342 || GET_CODE (x) == ZERO_EXTRACT)
4343 && GET_MODE (x) == DImode
4344 && GET_CODE (XEXP (x, 0)) == ASHIFT
4345 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4346 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4348 type = (GET_CODE (x) == SIGN_EXTRACT)
4349 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4350 index = XEXP (XEXP (x, 0), 0);
4351 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4352 if (INTVAL (XEXP (x, 1)) != 32 + shift
4353 || INTVAL (XEXP (x, 2)) != 0)
4354 shift = -1;
4356 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4357 (const_int 0xffffffff<<shift)) */
4358 else if (GET_CODE (x) == AND
4359 && GET_MODE (x) == DImode
4360 && GET_CODE (XEXP (x, 0)) == ASHIFT
4361 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4362 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4363 && CONST_INT_P (XEXP (x, 1)))
4365 type = ADDRESS_REG_UXTW;
4366 index = XEXP (XEXP (x, 0), 0);
4367 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4368 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4369 shift = -1;
4371 /* (mult:P (reg:P) (const_int scale)) */
4372 else if (GET_CODE (x) == MULT
4373 && GET_MODE (x) == Pmode
4374 && GET_MODE (XEXP (x, 0)) == Pmode
4375 && CONST_INT_P (XEXP (x, 1)))
4377 type = ADDRESS_REG_REG;
4378 index = XEXP (x, 0);
4379 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4381 /* (ashift:P (reg:P) (const_int shift)) */
4382 else if (GET_CODE (x) == ASHIFT
4383 && GET_MODE (x) == Pmode
4384 && GET_MODE (XEXP (x, 0)) == Pmode
4385 && CONST_INT_P (XEXP (x, 1)))
4387 type = ADDRESS_REG_REG;
4388 index = XEXP (x, 0);
4389 shift = INTVAL (XEXP (x, 1));
4391 else
4392 return false;
4394 if (!strict_p
4395 && GET_CODE (index) == SUBREG
4396 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4397 index = SUBREG_REG (index);
4399 if ((shift == 0 ||
4400 (shift > 0 && shift <= 3
4401 && (1 << shift) == GET_MODE_SIZE (mode)))
4402 && REG_P (index)
4403 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4405 info->type = type;
4406 info->offset = index;
4407 info->shift = shift;
4408 return true;
4411 return false;
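/* For example, for a DImode access the index
   (mult:DI (reg:DI x1) (const_int 8)) is classified as ADDRESS_REG_REG with
   shift 3, matching the [base, x1, lsl 3] addressing form.  */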
4414 /* Return true if MODE is one of the modes for which we
4415 support LDP/STP operations. */
4417 static bool
4418 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4420 return mode == SImode || mode == DImode
4421 || mode == SFmode || mode == DFmode
4422 || (aarch64_vector_mode_supported_p (mode)
4423 && GET_MODE_SIZE (mode) == 8);
4426 /* Return true if REGNO is a virtual pointer register, or an eliminable
4427 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4428 include stack_pointer or hard_frame_pointer. */
4429 static bool
4430 virt_or_elim_regno_p (unsigned regno)
4432 return ((regno >= FIRST_VIRTUAL_REGISTER
4433 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4434 || regno == FRAME_POINTER_REGNUM
4435 || regno == ARG_POINTER_REGNUM);
4438 /* Return true if X is a valid address for machine mode MODE. If it is,
4439 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4440 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4442 static bool
4443 aarch64_classify_address (struct aarch64_address_info *info,
4444 rtx x, machine_mode mode,
4445 RTX_CODE outer_code, bool strict_p)
4447 enum rtx_code code = GET_CODE (x);
4448 rtx op0, op1;
4450 /* On BE, we use load/store pair for all large int mode load/stores.
4451 TI/TFmode may also use a load/store pair. */
4452 bool load_store_pair_p = (outer_code == PARALLEL
4453 || mode == TImode
4454 || mode == TFmode
4455 || (BYTES_BIG_ENDIAN
4456 && aarch64_vect_struct_mode_p (mode)));
4458 bool allow_reg_index_p =
4459 !load_store_pair_p
4460 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4461 && !aarch64_vect_struct_mode_p (mode);
4463 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4464 REG addressing. */
4465 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4466 && (code != POST_INC && code != REG))
4467 return false;
4469 switch (code)
4471 case REG:
4472 case SUBREG:
4473 info->type = ADDRESS_REG_IMM;
4474 info->base = x;
4475 info->offset = const0_rtx;
4476 return aarch64_base_register_rtx_p (x, strict_p);
4478 case PLUS:
4479 op0 = XEXP (x, 0);
4480 op1 = XEXP (x, 1);
4482 if (! strict_p
4483 && REG_P (op0)
4484 && virt_or_elim_regno_p (REGNO (op0))
4485 && CONST_INT_P (op1))
4487 info->type = ADDRESS_REG_IMM;
4488 info->base = op0;
4489 info->offset = op1;
4491 return true;
4494 if (GET_MODE_SIZE (mode) != 0
4495 && CONST_INT_P (op1)
4496 && aarch64_base_register_rtx_p (op0, strict_p))
4498 HOST_WIDE_INT offset = INTVAL (op1);
4500 info->type = ADDRESS_REG_IMM;
4501 info->base = op0;
4502 info->offset = op1;
4504 /* TImode and TFmode values are allowed in both pairs of X
4505 registers and individual Q registers. The available
4506 address modes are:
4507 X,X: 7-bit signed scaled offset
4508 Q: 9-bit signed offset
4509 We conservatively require an offset representable in either mode.
4510 When performing the check for pairs of X registers i.e. LDP/STP
4511 pass down DImode since that is the natural size of the LDP/STP
4512 instruction memory accesses. */
4513 if (mode == TImode || mode == TFmode)
4514 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4515 && (offset_9bit_signed_unscaled_p (mode, offset)
4516 || offset_12bit_unsigned_scaled_p (mode, offset)));
4518 /* A 7-bit offset check because OImode will emit an ldp/stp
4519 instruction (only big endian will get here).
4520 For ldp/stp instructions, the offset is scaled for the size of a
4521 single element of the pair. */
4522 if (mode == OImode)
4523 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4525 /* Three 9/12-bit offset checks because CImode will emit three
4526 ldr/str instructions (only big endian will get here). */
4527 if (mode == CImode)
4528 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4529 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4530 || offset_12bit_unsigned_scaled_p (V16QImode,
4531 offset + 32)));
4533 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4534 instructions (only big endian will get here). */
4535 if (mode == XImode)
4536 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4537 && aarch64_offset_7bit_signed_scaled_p (TImode,
4538 offset + 32));
4540 if (load_store_pair_p)
4541 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4542 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4543 else
4544 return (offset_9bit_signed_unscaled_p (mode, offset)
4545 || offset_12bit_unsigned_scaled_p (mode, offset));
4548 if (allow_reg_index_p)
4550 /* Look for base + (scaled/extended) index register. */
4551 if (aarch64_base_register_rtx_p (op0, strict_p)
4552 && aarch64_classify_index (info, op1, mode, strict_p))
4554 info->base = op0;
4555 return true;
4557 if (aarch64_base_register_rtx_p (op1, strict_p)
4558 && aarch64_classify_index (info, op0, mode, strict_p))
4560 info->base = op1;
4561 return true;
4565 return false;
4567 case POST_INC:
4568 case POST_DEC:
4569 case PRE_INC:
4570 case PRE_DEC:
4571 info->type = ADDRESS_REG_WB;
4572 info->base = XEXP (x, 0);
4573 info->offset = NULL_RTX;
4574 return aarch64_base_register_rtx_p (info->base, strict_p);
4576 case POST_MODIFY:
4577 case PRE_MODIFY:
4578 info->type = ADDRESS_REG_WB;
4579 info->base = XEXP (x, 0);
4580 if (GET_CODE (XEXP (x, 1)) == PLUS
4581 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4582 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4583 && aarch64_base_register_rtx_p (info->base, strict_p))
4585 HOST_WIDE_INT offset;
4586 info->offset = XEXP (XEXP (x, 1), 1);
4587 offset = INTVAL (info->offset);
4589 /* TImode and TFmode values are allowed in both pairs of X
4590 registers and individual Q registers. The available
4591 address modes are:
4592 X,X: 7-bit signed scaled offset
4593 Q: 9-bit signed offset
4594 We conservatively require an offset representable in either mode. */
4596 if (mode == TImode || mode == TFmode)
4597 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4598 && offset_9bit_signed_unscaled_p (mode, offset));
4600 if (load_store_pair_p)
4601 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4602 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4603 else
4604 return offset_9bit_signed_unscaled_p (mode, offset);
4606 return false;
4608 case CONST:
4609 case SYMBOL_REF:
4610 case LABEL_REF:
4611 /* load literal: pc-relative constant pool entry. Only supported
4612 for SI mode or larger. */
4613 info->type = ADDRESS_SYMBOLIC;
4615 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4617 rtx sym, addend;
4619 split_const (x, &sym, &addend);
4620 return ((GET_CODE (sym) == LABEL_REF
4621 || (GET_CODE (sym) == SYMBOL_REF
4622 && CONSTANT_POOL_ADDRESS_P (sym)
4623 && aarch64_pcrelative_literal_loads)));
4625 return false;
4627 case LO_SUM:
4628 info->type = ADDRESS_LO_SUM;
4629 info->base = XEXP (x, 0);
4630 info->offset = XEXP (x, 1);
4631 if (allow_reg_index_p
4632 && aarch64_base_register_rtx_p (info->base, strict_p))
4634 rtx sym, offs;
4635 split_const (info->offset, &sym, &offs);
4636 if (GET_CODE (sym) == SYMBOL_REF
4637 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4639 /* The symbol and offset must be aligned to the access size. */
4640 unsigned int align;
4641 unsigned int ref_size;
4643 if (CONSTANT_POOL_ADDRESS_P (sym))
4644 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4645 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4647 tree exp = SYMBOL_REF_DECL (sym);
4648 align = TYPE_ALIGN (TREE_TYPE (exp));
4649 align = aarch64_constant_alignment (exp, align);
4651 else if (SYMBOL_REF_DECL (sym))
4652 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4653 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4654 && SYMBOL_REF_BLOCK (sym) != NULL)
4655 align = SYMBOL_REF_BLOCK (sym)->alignment;
4656 else
4657 align = BITS_PER_UNIT;
4659 ref_size = GET_MODE_SIZE (mode);
4660 if (ref_size == 0)
4661 ref_size = GET_MODE_SIZE (DImode);
4663 return ((INTVAL (offs) & (ref_size - 1)) == 0
4664 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4667 return false;
4669 default:
4670 return false;
4674 /* Return true if the address X is valid for a PRFM instruction.
4675 STRICT_P is true if we should do strict checking with
4676 aarch64_classify_address. */
4678 bool
4679 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4681 struct aarch64_address_info addr;
4683 /* PRFM accepts the same addresses as DImode... */
4684 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4685 if (!res)
4686 return false;
4688 /* ... except writeback forms. */
4689 return addr.type != ADDRESS_REG_WB;
4692 bool
4693 aarch64_symbolic_address_p (rtx x)
4695 rtx offset;
4697 split_const (x, &x, &offset);
4698 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4701 /* Classify the base of symbolic expression X. */
4703 enum aarch64_symbol_type
4704 aarch64_classify_symbolic_expression (rtx x)
4706 rtx offset;
4708 split_const (x, &x, &offset);
4709 return aarch64_classify_symbol (x, offset);
4713 /* Return TRUE if X is a legitimate address for accessing memory in
4714 mode MODE. */
4715 static bool
4716 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4718 struct aarch64_address_info addr;
4720 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4723 /* Return TRUE if X is a legitimate address for accessing memory in
4724 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4725 pair operation. */
4726 bool
4727 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4728 RTX_CODE outer_code, bool strict_p)
4730 struct aarch64_address_info addr;
4732 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4735 /* Split an out-of-range address displacement into a base and offset.
4736 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4737 to increase opportunities for sharing the base address between accesses
4738 of different sizes. Unaligned accesses use the signed 9-bit range, while
4739 TImode/TFmode use the intersection of the signed scaled 7-bit and signed 9-bit offset ranges. */
4740 static bool
4741 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4743 HOST_WIDE_INT offset = INTVAL (*disp);
4744 HOST_WIDE_INT base;
4746 if (mode == TImode || mode == TFmode)
4747 base = (offset + 0x100) & ~0x1f8;
4748 else if ((offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4749 base = (offset + 0x100) & ~0x1ff;
4750 else
4751 base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4753 *off = GEN_INT (base);
4754 *disp = GEN_INT (offset - base);
4755 return true;
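/* For example, an SImode access at the unaligned offset 0x2007 is split into
   a base of 0x2000 plus a residual displacement of 7, which fits the signed
   9-bit unscaled range.  */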
4758 /* Return the binary representation of floating point constant VALUE in INTVAL.
4759 If the value cannot be converted, return false without setting INTVAL.
4760 The conversion is done in the mode of VALUE. */
4761 bool
4762 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4765 /* We make a general exception for 0. */
4766 if (aarch64_float_const_zero_rtx_p (value))
4768 *intval = 0;
4769 return true;
4772 machine_mode mode = GET_MODE (value);
4773 if (GET_CODE (value) != CONST_DOUBLE
4774 || !SCALAR_FLOAT_MODE_P (mode)
4775 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4776 /* Only support up to DF mode. */
4777 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4778 return false;
4780 unsigned HOST_WIDE_INT ival = 0;
4782 long res[2];
4783 real_to_target (res,
4784 CONST_DOUBLE_REAL_VALUE (value),
4785 REAL_MODE_FORMAT (mode));
4787 if (mode == DFmode)
4789 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4790 ival = zext_hwi (res[order], 32);
4791 ival |= (zext_hwi (res[1 - order], 32) << 32);
4793 else
4794 ival = zext_hwi (res[0], 32);
4796 *intval = ival;
4797 return true;
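/* For example, the DFmode constant 1.0 is returned as the IEEE
   double-precision bit pattern 0x3ff0000000000000.  */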
4800 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4801 single MOV(+MOVK) followed by an FMOV. */
4802 bool
4803 aarch64_float_const_rtx_p (rtx x)
4805 machine_mode mode = GET_MODE (x);
4806 if (mode == VOIDmode)
4807 return false;
4809 /* Determine whether it's cheaper to write float constants as
4810 mov/movk pairs rather than ldr/adrp pairs. */
4811 unsigned HOST_WIDE_INT ival;
4813 if (GET_CODE (x) == CONST_DOUBLE
4814 && SCALAR_FLOAT_MODE_P (mode)
4815 && aarch64_reinterpret_float_as_int (x, &ival))
4817 scalar_int_mode imode = (mode == HFmode
4818 ? SImode
4819 : int_mode_for_mode (mode).require ());
4820 int num_instr = aarch64_internal_mov_immediate
4821 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4822 return num_instr < 3;
4825 return false;
4828 /* Return TRUE if rtx X is the immediate constant 0.0. */
4829 bool
4830 aarch64_float_const_zero_rtx_p (rtx x)
4832 if (GET_MODE (x) == VOIDmode)
4833 return false;
4835 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4836 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4837 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4840 /* Return TRUE if rtx X is an immediate constant that fits in a single
4841 MOVI immediate operation. */
4842 bool
4843 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4845 if (!TARGET_SIMD)
4846 return false;
4848 machine_mode vmode;
4849 scalar_int_mode imode;
4850 unsigned HOST_WIDE_INT ival;
4852 if (GET_CODE (x) == CONST_DOUBLE
4853 && SCALAR_FLOAT_MODE_P (mode))
4855 if (!aarch64_reinterpret_float_as_int (x, &ival))
4856 return false;
4858 /* We make a general exception for 0. */
4859 if (aarch64_float_const_zero_rtx_p (x))
4860 return true;
4862 imode = int_mode_for_mode (mode).require ();
4864 else if (GET_CODE (x) == CONST_INT
4865 && is_a <scalar_int_mode> (mode, &imode))
4866 ival = INTVAL (x);
4867 else
4868 return false;
4870 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
4871 use a 128-bit vector mode. */
4872 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4874 vmode = aarch64_simd_container_mode (imode, width);
4875 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4877 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4881 /* Return the fixed registers used for condition codes. */
4883 static bool
4884 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4886 *p1 = CC_REGNUM;
4887 *p2 = INVALID_REGNUM;
4888 return true;
4891 /* This function is used by the call expanders of the machine description.
4892 RESULT is the register in which the result is returned. It's NULL for
4893 "call" and "sibcall".
4894 MEM is the location of the function call.
4895 SIBCALL indicates whether this is a normal call or a sibling call; a
4896 different pattern is generated accordingly. */
4898 void
4899 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4901 rtx call, callee, tmp;
4902 rtvec vec;
4903 machine_mode mode;
4905 gcc_assert (MEM_P (mem));
4906 callee = XEXP (mem, 0);
4907 mode = GET_MODE (callee);
4908 gcc_assert (mode == Pmode);
4910 /* Decide if we should generate indirect calls by loading the
4911 address of the callee into a register before performing
4912 the branch-and-link. */
4913 if (SYMBOL_REF_P (callee)
4914 ? (aarch64_is_long_call_p (callee)
4915 || aarch64_is_noplt_call_p (callee))
4916 : !REG_P (callee))
4917 XEXP (mem, 0) = force_reg (mode, callee);
4919 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4921 if (result != NULL_RTX)
4922 call = gen_rtx_SET (result, call);
4924 if (sibcall)
4925 tmp = ret_rtx;
4926 else
4927 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4929 vec = gen_rtvec (2, call, tmp);
4930 call = gen_rtx_PARALLEL (VOIDmode, vec);
4932 aarch64_emit_call_insn (call);
4935 /* Emit call insn with PAT and do aarch64-specific handling. */
4937 void
4938 aarch64_emit_call_insn (rtx pat)
4940 rtx insn = emit_call_insn (pat);
4942 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4943 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4944 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
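/* The IP0/IP1 clobbers record that x16 and x17 may be corrupted across the
   call, since linker-inserted long-branch veneers and PLT stubs are allowed
   to use these registers.  */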
4947 machine_mode
4948 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4950 /* All floating point compares return CCFP if it is an equality
4951 comparison, and CCFPE otherwise. */
4952 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4954 switch (code)
4956 case EQ:
4957 case NE:
4958 case UNORDERED:
4959 case ORDERED:
4960 case UNLT:
4961 case UNLE:
4962 case UNGT:
4963 case UNGE:
4964 case UNEQ:
4965 case LTGT:
4966 return CCFPmode;
4968 case LT:
4969 case LE:
4970 case GT:
4971 case GE:
4972 return CCFPEmode;
4974 default:
4975 gcc_unreachable ();
4979 /* Equality comparisons of short modes against zero can be performed
4980 using the TST instruction with the appropriate bitmask. */
4981 if (y == const0_rtx && REG_P (x)
4982 && (code == EQ || code == NE)
4983 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4984 return CC_NZmode;
4986 /* Similarly, comparisons of zero_extends from shorter modes can
4987 be performed using an ANDS with an immediate mask. */
4988 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4989 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4990 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4991 && (code == EQ || code == NE))
4992 return CC_NZmode;
4994 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4995 && y == const0_rtx
4996 && (code == EQ || code == NE || code == LT || code == GE)
4997 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4998 || GET_CODE (x) == NEG
4999 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
5000 && CONST_INT_P (XEXP (x, 2)))))
5001 return CC_NZmode;
5003 /* A compare with a shifted operand. Because of canonicalization,
5004 the comparison will have to be swapped when we emit the assembly
5005 code. */
5006 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5007 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
5008 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
5009 || GET_CODE (x) == LSHIFTRT
5010 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
5011 return CC_SWPmode;
5013 /* Similarly for a negated operand, but we can only do this for
5014 equalities. */
5015 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5016 && (REG_P (y) || GET_CODE (y) == SUBREG)
5017 && (code == EQ || code == NE)
5018 && GET_CODE (x) == NEG)
5019 return CC_Zmode;
5021 /* A test for unsigned overflow. */
5022 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5023 && code == NE
5024 && GET_CODE (x) == PLUS
5025 && GET_CODE (y) == ZERO_EXTEND)
5026 return CC_Cmode;
5028 /* For everything else, return CCmode. */
5029 return CCmode;
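/* Note that when CC_SWPmode is selected, the comparison operands are swapped
   when the assembly is emitted; aarch64_get_condition_code_1 below compensates
   by mapping each condition to its swapped counterpart (GE<->LE, GT<->LT, and
   so on).  */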
5032 static int
5033 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5036 aarch64_get_condition_code (rtx x)
5038 machine_mode mode = GET_MODE (XEXP (x, 0));
5039 enum rtx_code comp_code = GET_CODE (x);
5041 if (GET_MODE_CLASS (mode) != MODE_CC)
5042 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5043 return aarch64_get_condition_code_1 (mode, comp_code);
5046 static int
5047 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5049 switch (mode)
5051 case E_CCFPmode:
5052 case E_CCFPEmode:
5053 switch (comp_code)
5055 case GE: return AARCH64_GE;
5056 case GT: return AARCH64_GT;
5057 case LE: return AARCH64_LS;
5058 case LT: return AARCH64_MI;
5059 case NE: return AARCH64_NE;
5060 case EQ: return AARCH64_EQ;
5061 case ORDERED: return AARCH64_VC;
5062 case UNORDERED: return AARCH64_VS;
5063 case UNLT: return AARCH64_LT;
5064 case UNLE: return AARCH64_LE;
5065 case UNGT: return AARCH64_HI;
5066 case UNGE: return AARCH64_PL;
5067 default: return -1;
5069 break;
5071 case E_CCmode:
5072 switch (comp_code)
5074 case NE: return AARCH64_NE;
5075 case EQ: return AARCH64_EQ;
5076 case GE: return AARCH64_GE;
5077 case GT: return AARCH64_GT;
5078 case LE: return AARCH64_LE;
5079 case LT: return AARCH64_LT;
5080 case GEU: return AARCH64_CS;
5081 case GTU: return AARCH64_HI;
5082 case LEU: return AARCH64_LS;
5083 case LTU: return AARCH64_CC;
5084 default: return -1;
5086 break;
5088 case E_CC_SWPmode:
5089 switch (comp_code)
5091 case NE: return AARCH64_NE;
5092 case EQ: return AARCH64_EQ;
5093 case GE: return AARCH64_LE;
5094 case GT: return AARCH64_LT;
5095 case LE: return AARCH64_GE;
5096 case LT: return AARCH64_GT;
5097 case GEU: return AARCH64_LS;
5098 case GTU: return AARCH64_CC;
5099 case LEU: return AARCH64_CS;
5100 case LTU: return AARCH64_HI;
5101 default: return -1;
5103 break;
5105 case E_CC_NZmode:
5106 switch (comp_code)
5108 case NE: return AARCH64_NE;
5109 case EQ: return AARCH64_EQ;
5110 case GE: return AARCH64_PL;
5111 case LT: return AARCH64_MI;
5112 default: return -1;
5114 break;
5116 case E_CC_Zmode:
5117 switch (comp_code)
5119 case NE: return AARCH64_NE;
5120 case EQ: return AARCH64_EQ;
5121 default: return -1;
5123 break;
5125 case E_CC_Cmode:
5126 switch (comp_code)
5128 case NE: return AARCH64_CS;
5129 case EQ: return AARCH64_CC;
5130 default: return -1;
5132 break;
5134 default:
5135 return -1;
5138 return -1;
5141 bool
5142 aarch64_const_vec_all_same_in_range_p (rtx x,
5143 HOST_WIDE_INT minval,
5144 HOST_WIDE_INT maxval)
5146 HOST_WIDE_INT firstval;
5147 int count, i;
5149 if (GET_CODE (x) != CONST_VECTOR
5150 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5151 return false;
5153 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5154 if (firstval < minval || firstval > maxval)
5155 return false;
5157 count = CONST_VECTOR_NUNITS (x);
5158 for (i = 1; i < count; i++)
5159 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5160 return false;
5162 return true;
5165 bool
5166 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5168 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5172 /* N Z C V. */
5173 #define AARCH64_CC_V 1
5174 #define AARCH64_CC_C (1 << 1)
5175 #define AARCH64_CC_Z (1 << 2)
5176 #define AARCH64_CC_N (1 << 3)
5178 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5179 static const int aarch64_nzcv_codes[] =
5181 0, /* EQ, Z == 1. */
5182 AARCH64_CC_Z, /* NE, Z == 0. */
5183 0, /* CS, C == 1. */
5184 AARCH64_CC_C, /* CC, C == 0. */
5185 0, /* MI, N == 1. */
5186 AARCH64_CC_N, /* PL, N == 0. */
5187 0, /* VS, V == 1. */
5188 AARCH64_CC_V, /* VC, V == 0. */
5189 0, /* HI, C == 1 && Z == 0. */
5190 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5191 AARCH64_CC_V, /* GE, N == V. */
5192 0, /* LT, N != V. */
5193 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5194 0, /* LE, !(Z == 0 && N == V). */
5195 0, /* AL, Any. */
5196 0 /* NV, Any. */
5199 /* Print operand X to file F in a target specific manner according to CODE.
5200 The acceptable formatting commands given by CODE are:
5201 'c': An integer or symbol address without a preceding #
5202 sign.
5203 'e': Print the sign/zero-extend size as a character 8->b,
5204 16->h, 32->w.
5205 'p': Prints N such that 2^N == X (X must be power of 2 and
5206 const int).
5207 'P': Print the number of non-zero bits in X (a const_int).
5208 'H': Print the higher numbered register of a pair (TImode)
5209 of regs.
5210 'm': Print a condition (eq, ne, etc).
5211 'M': Same as 'm', but invert condition.
5212 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5213 'S/T/U/V': Print a FP/SIMD register name for a register list.
5214 The register printed is the FP/SIMD register name
5215 of X + 0/1/2/3 for S/T/U/V.
5216 'R': Print a scalar FP/SIMD register name + 1.
5217 'X': Print bottom 16 bits of integer constant in hex.
5218 'w/x': Print a general register name or the zero register
5219 (32-bit or 64-bit).
5220 '0': Print a normal operand, if it's a general register,
5221 then we assume DImode.
5222 'k': Print NZCV for conditional compare instructions.
5223 'A': Output address constant representing the first
5224 argument of X, specifying a relocation offset
5225 if appropriate.
5226 'L': Output constant address specified by X
5227 with a relocation offset if appropriate.
5228 'G': Prints address of X, specifying a PC relative
5229 relocation mode if appropriate.
5230 'y': Output address of LDP or STP - this is used for
5231 some LDP/STPs which don't use a PARALLEL in their
5232 pattern (so the mode needs to be adjusted).
5233 'z': Output address of a typical LDP or STP. */
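/* For example, in an output template "%w0" and "%x0" print the 32-bit and
   64-bit names of the general register in operand 0 (such as "w1" or "x1"),
   while "%k1" prints the NZCV immediate used by conditional compare
   instructions.  */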
5235 static void
5236 aarch64_print_operand (FILE *f, rtx x, int code)
5238 switch (code)
5240 case 'c':
5241 switch (GET_CODE (x))
5243 case CONST_INT:
5244 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5245 break;
5247 case SYMBOL_REF:
5248 output_addr_const (f, x);
5249 break;
5251 case CONST:
5252 if (GET_CODE (XEXP (x, 0)) == PLUS
5253 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5255 output_addr_const (f, x);
5256 break;
5258 /* Fall through. */
5260 default:
5261 output_operand_lossage ("Unsupported operand for code '%c'", code);
5263 break;
5265 case 'e':
5267 int n;
5269 if (!CONST_INT_P (x)
5270 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5272 output_operand_lossage ("invalid operand for '%%%c'", code);
5273 return;
5276 switch (n)
5278 case 3:
5279 fputc ('b', f);
5280 break;
5281 case 4:
5282 fputc ('h', f);
5283 break;
5284 case 5:
5285 fputc ('w', f);
5286 break;
5287 default:
5288 output_operand_lossage ("invalid operand for '%%%c'", code);
5289 return;
5292 break;
5294 case 'p':
5296 int n;
5298 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5300 output_operand_lossage ("invalid operand for '%%%c'", code);
5301 return;
5304 asm_fprintf (f, "%d", n);
5306 break;
5308 case 'P':
5309 if (!CONST_INT_P (x))
5311 output_operand_lossage ("invalid operand for '%%%c'", code);
5312 return;
5315 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5316 break;
5318 case 'H':
5319 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5321 output_operand_lossage ("invalid operand for '%%%c'", code);
5322 return;
5325 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5326 break;
5328 case 'M':
5329 case 'm':
5331 int cond_code;
5332 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5333 if (x == const_true_rtx)
5335 if (code == 'M')
5336 fputs ("nv", f);
5337 return;
5340 if (!COMPARISON_P (x))
5342 output_operand_lossage ("invalid operand for '%%%c'", code);
5343 return;
5346 cond_code = aarch64_get_condition_code (x);
5347 gcc_assert (cond_code >= 0);
5348 if (code == 'M')
5349 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5350 fputs (aarch64_condition_codes[cond_code], f);
5352 break;
5354 case 'b':
5355 case 'h':
5356 case 's':
5357 case 'd':
5358 case 'q':
5359 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5361 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5362 return;
5364 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5365 break;
5367 case 'S':
5368 case 'T':
5369 case 'U':
5370 case 'V':
5371 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5373 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5374 return;
5376 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5377 break;
5379 case 'R':
5380 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5382 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5383 return;
5385 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5386 break;
5388 case 'X':
5389 if (!CONST_INT_P (x))
5391 output_operand_lossage ("invalid operand for '%%%c'", code);
5392 return;
5394 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5395 break;
5397 case 'w':
5398 case 'x':
5399 if (x == const0_rtx
5400 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5402 asm_fprintf (f, "%czr", code);
5403 break;
5406 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5408 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5409 break;
5412 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5414 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5415 break;
5418 /* Fall through */
5420 case 0:
5421 if (x == NULL)
5423 output_operand_lossage ("missing operand");
5424 return;
5427 switch (GET_CODE (x))
5429 case REG:
5430 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5431 break;
5433 case MEM:
5434 output_address (GET_MODE (x), XEXP (x, 0));
5435 break;
5437 case CONST:
5438 case LABEL_REF:
5439 case SYMBOL_REF:
5440 output_addr_const (asm_out_file, x);
5441 break;
5443 case CONST_INT:
5444 asm_fprintf (f, "%wd", INTVAL (x));
5445 break;
5447 case CONST_VECTOR:
5448 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5450 gcc_assert (
5451 aarch64_const_vec_all_same_in_range_p (x,
5452 HOST_WIDE_INT_MIN,
5453 HOST_WIDE_INT_MAX));
5454 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5456 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5458 fputc ('0', f);
5460 else
5461 gcc_unreachable ();
5462 break;
5464 case CONST_DOUBLE:
5465 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5466 be getting CONST_DOUBLEs holding integers. */
5467 gcc_assert (GET_MODE (x) != VOIDmode);
5468 if (aarch64_float_const_zero_rtx_p (x))
5470 fputc ('0', f);
5471 break;
5473 else if (aarch64_float_const_representable_p (x))
5475 #define buf_size 20
5476 char float_buf[buf_size] = {'\0'};
5477 real_to_decimal_for_mode (float_buf,
5478 CONST_DOUBLE_REAL_VALUE (x),
5479 buf_size, buf_size,
5480 1, GET_MODE (x));
5481 asm_fprintf (asm_out_file, "%s", float_buf);
5482 break;
5483 #undef buf_size
5485 output_operand_lossage ("invalid constant");
5486 return;
5487 default:
5488 output_operand_lossage ("invalid operand");
5489 return;
5491 break;
5493 case 'A':
5494 if (GET_CODE (x) == HIGH)
5495 x = XEXP (x, 0);
5497 switch (aarch64_classify_symbolic_expression (x))
5499 case SYMBOL_SMALL_GOT_4G:
5500 asm_fprintf (asm_out_file, ":got:");
5501 break;
5503 case SYMBOL_SMALL_TLSGD:
5504 asm_fprintf (asm_out_file, ":tlsgd:");
5505 break;
5507 case SYMBOL_SMALL_TLSDESC:
5508 asm_fprintf (asm_out_file, ":tlsdesc:");
5509 break;
5511 case SYMBOL_SMALL_TLSIE:
5512 asm_fprintf (asm_out_file, ":gottprel:");
5513 break;
5515 case SYMBOL_TLSLE24:
5516 asm_fprintf (asm_out_file, ":tprel:");
5517 break;
5519 case SYMBOL_TINY_GOT:
5520 gcc_unreachable ();
5521 break;
5523 default:
5524 break;
5526 output_addr_const (asm_out_file, x);
5527 break;
5529 case 'L':
5530 switch (aarch64_classify_symbolic_expression (x))
5532 case SYMBOL_SMALL_GOT_4G:
5533 asm_fprintf (asm_out_file, ":lo12:");
5534 break;
5536 case SYMBOL_SMALL_TLSGD:
5537 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5538 break;
5540 case SYMBOL_SMALL_TLSDESC:
5541 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5542 break;
5544 case SYMBOL_SMALL_TLSIE:
5545 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5546 break;
5548 case SYMBOL_TLSLE12:
5549 asm_fprintf (asm_out_file, ":tprel_lo12:");
5550 break;
5552 case SYMBOL_TLSLE24:
5553 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5554 break;
5556 case SYMBOL_TINY_GOT:
5557 asm_fprintf (asm_out_file, ":got:");
5558 break;
5560 case SYMBOL_TINY_TLSIE:
5561 asm_fprintf (asm_out_file, ":gottprel:");
5562 break;
5564 default:
5565 break;
5567 output_addr_const (asm_out_file, x);
5568 break;
5570 case 'G':
5571 switch (aarch64_classify_symbolic_expression (x))
5573 case SYMBOL_TLSLE24:
5574 asm_fprintf (asm_out_file, ":tprel_hi12:");
5575 break;
5576 default:
5577 break;
5579 output_addr_const (asm_out_file, x);
5580 break;
5582 case 'k':
5584 HOST_WIDE_INT cond_code;
5586 if (!CONST_INT_P (x))
5588 output_operand_lossage ("invalid operand for '%%%c'", code);
5589 return;
5592 cond_code = INTVAL (x);
5593 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5594 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5596 break;
5598 case 'y':
5599 case 'z':
5601 machine_mode mode = GET_MODE (x);
5603 if (GET_CODE (x) != MEM
5604 || (code == 'y' && GET_MODE_SIZE (mode) != 16))
5606 output_operand_lossage ("invalid operand for '%%%c'", code);
5607 return;
5610 if (code == 'y')
5611 /* LDP/STP which uses a single double-width memory operand.
5612 Adjust the mode to appear like a typical LDP/STP.
5613 Currently this is supported for 16-byte accesses only. */
5614 mode = DFmode;
5616 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
5617 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5619 break;
5621 default:
5622 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5623 return;
5627 /* Print address 'x' of a memory access with mode 'mode'.
5628 'op' is the context required by aarch64_classify_address. It can either be
5629 MEM for a normal memory access or PARALLEL for LDP/STP. */
5630 static bool
5631 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, RTX_CODE op)
5633 struct aarch64_address_info addr;
5635 /* Check all addresses are Pmode - including ILP32. */
5636 gcc_assert (GET_MODE (x) == Pmode);
5638 if (aarch64_classify_address (&addr, x, mode, op, true))
5639 switch (addr.type)
5641 case ADDRESS_REG_IMM:
5642 if (addr.offset == const0_rtx)
5643 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5644 else
5645 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5646 INTVAL (addr.offset));
5647 return true;
5649 case ADDRESS_REG_REG:
5650 if (addr.shift == 0)
5651 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5652 reg_names [REGNO (addr.offset)]);
5653 else
5654 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5655 reg_names [REGNO (addr.offset)], addr.shift);
5656 return true;
5658 case ADDRESS_REG_UXTW:
5659 if (addr.shift == 0)
5660 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5661 REGNO (addr.offset) - R0_REGNUM);
5662 else
5663 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5664 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5665 return true;
5667 case ADDRESS_REG_SXTW:
5668 if (addr.shift == 0)
5669 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5670 REGNO (addr.offset) - R0_REGNUM);
5671 else
5672 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5673 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5674 return true;
5676 case ADDRESS_REG_WB:
5677 switch (GET_CODE (x))
5679 case PRE_INC:
5680 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5681 GET_MODE_SIZE (mode));
5682 return true;
5683 case POST_INC:
5684 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5685 GET_MODE_SIZE (mode));
5686 return true;
5687 case PRE_DEC:
5688 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5689 GET_MODE_SIZE (mode));
5690 return true;
5691 case POST_DEC:
5692 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5693 GET_MODE_SIZE (mode));
5694 return true;
5695 case PRE_MODIFY:
5696 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5697 INTVAL (addr.offset));
5698 return true;
5699 case POST_MODIFY:
5700 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5701 INTVAL (addr.offset));
5702 return true;
5703 default:
5704 break;
5706 break;
5708 case ADDRESS_LO_SUM:
5709 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5710 output_addr_const (f, addr.offset);
5711 asm_fprintf (f, "]");
5712 return true;
5714 case ADDRESS_SYMBOLIC:
5715 output_addr_const (f, x);
5716 return true;
5719 return false;
5722 /* Print address 'x' of a LDP/STP with mode 'mode'. */
5723 static bool
5724 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
5726 return aarch64_print_address_internal (f, mode, x, PARALLEL);
5729 /* Print address 'x' of a memory access with mode 'mode'. */
5730 static void
5731 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5733 if (!aarch64_print_address_internal (f, mode, x, MEM))
5734 output_addr_const (f, x);
5737 bool
5738 aarch64_label_mentioned_p (rtx x)
5740 const char *fmt;
5741 int i;
5743 if (GET_CODE (x) == LABEL_REF)
5744 return true;
5746 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5747 referencing instruction, but they are constant offsets, not
5748 symbols. */
5749 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5750 return false;
5752 fmt = GET_RTX_FORMAT (GET_CODE (x));
5753 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5755 if (fmt[i] == 'E')
5757 int j;
5759 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5760 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5761 return 1;
5763 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5764 return 1;
5767 return 0;
5770 /* Implement REGNO_REG_CLASS. */
5772 enum reg_class
5773 aarch64_regno_regclass (unsigned regno)
5775 if (GP_REGNUM_P (regno))
5776 return GENERAL_REGS;
5778 if (regno == SP_REGNUM)
5779 return STACK_REG;
5781 if (regno == FRAME_POINTER_REGNUM
5782 || regno == ARG_POINTER_REGNUM)
5783 return POINTER_REGS;
5785 if (FP_REGNUM_P (regno))
5786 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5788 return NO_REGS;
5791 static rtx
5792 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5794 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5795 where mask is selected by alignment and size of the offset.
5796 We try to pick as large a range for the offset as possible to
5797 maximize the chance of a CSE. However, for aligned addresses
5798 we limit the range to 4k so that structures with different sized
5799 elements are likely to use the same base. We need to be careful
5800 not to split a CONST for some forms of address expression, otherwise
5801 it will generate sub-optimal code. */
5803 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5805 rtx base = XEXP (x, 0);
5806 rtx offset_rtx = XEXP (x, 1);
5807 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5809 if (GET_CODE (base) == PLUS)
5811 rtx op0 = XEXP (base, 0);
5812 rtx op1 = XEXP (base, 1);
5814 /* Force any scaling into a temp for CSE. */
5815 op0 = force_reg (Pmode, op0);
5816 op1 = force_reg (Pmode, op1);
5818 /* Let the pointer register be in op0. */
5819 if (REG_POINTER (op1))
5820 std::swap (op0, op1);
5822 /* If the pointer is virtual or frame related, then we know that
5823 virtual register instantiation or register elimination is going
5824 to apply a second constant. We want the two constants folded
5825 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5826 if (virt_or_elim_regno_p (REGNO (op0)))
5828 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5829 NULL_RTX, true, OPTAB_DIRECT);
5830 return gen_rtx_PLUS (Pmode, base, op1);
5833 /* Otherwise, in order to encourage CSE (and thence loop-strength
5834 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5835 base = expand_binop (Pmode, add_optab, op0, op1,
5836 NULL_RTX, true, OPTAB_DIRECT);
5837 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5840 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5841 HOST_WIDE_INT base_offset;
5842 if (GET_MODE_SIZE (mode) > 16)
5843 base_offset = (offset + 0x400) & ~0x7f0;
5844 /* For offsets that aren't a multiple of the access size, the limit is
5845 -256...255. */
5846 else if (offset & (GET_MODE_SIZE (mode) - 1))
5848 base_offset = (offset + 0x100) & ~0x1ff;
5850 /* BLKmode typically uses LDP of X-registers. */
5851 if (mode == BLKmode)
5852 base_offset = (offset + 512) & ~0x3ff;
5854 /* Small negative offsets are supported. */
5855 else if (IN_RANGE (offset, -256, 0))
5856 base_offset = 0;
5857 else if (mode == TImode || mode == TFmode)
5858 base_offset = (offset + 0x100) & ~0x1ff;
5859 /* Use a 12-bit offset scaled by the access size. */
5860 else
5861 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5863 if (base_offset != 0)
5865 base = plus_constant (Pmode, base, base_offset);
5866 base = force_operand (base, NULL_RTX);
5867 return plus_constant (Pmode, base, offset - base_offset);
5871 return x;
5874 /* Return the reload icode required for a constant pool in mode. */
5875 static enum insn_code
5876 aarch64_constant_pool_reload_icode (machine_mode mode)
5878 switch (mode)
5880 case E_SFmode:
5881 return CODE_FOR_aarch64_reload_movcpsfdi;
5883 case E_DFmode:
5884 return CODE_FOR_aarch64_reload_movcpdfdi;
5886 case E_TFmode:
5887 return CODE_FOR_aarch64_reload_movcptfdi;
5889 case E_V8QImode:
5890 return CODE_FOR_aarch64_reload_movcpv8qidi;
5892 case E_V16QImode:
5893 return CODE_FOR_aarch64_reload_movcpv16qidi;
5895 case E_V4HImode:
5896 return CODE_FOR_aarch64_reload_movcpv4hidi;
5898 case E_V8HImode:
5899 return CODE_FOR_aarch64_reload_movcpv8hidi;
5901 case E_V2SImode:
5902 return CODE_FOR_aarch64_reload_movcpv2sidi;
5904 case E_V4SImode:
5905 return CODE_FOR_aarch64_reload_movcpv4sidi;
5907 case E_V2DImode:
5908 return CODE_FOR_aarch64_reload_movcpv2didi;
5910 case E_V2DFmode:
5911 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5913 default:
5914 gcc_unreachable ();
5917 gcc_unreachable ();
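/* Implement TARGET_SECONDARY_RELOAD.  Either request a scratch register
   class (FP_REGS for 16-byte memory accesses reloaded into GENERAL_REGS,
   GENERAL_REGS for TImode/TFmode constants destined for FP_REGS), or
   return NO_REGS and set SRI->icode when a dedicated reload pattern is
   needed: for literal-pool references when PC-relative literal loads are
   disabled, and for Q-register to Q-register copies without TARGET_SIMD.  */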
5919 static reg_class_t
5920 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5921 reg_class_t rclass,
5922 machine_mode mode,
5923 secondary_reload_info *sri)
5926 /* If we have to disable direct literal pool loads and stores because the
5927 function is too big, then we need a scratch register. */
5928 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5929 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5930 || targetm.vector_mode_supported_p (GET_MODE (x)))
5931 && !aarch64_pcrelative_literal_loads)
5933 sri->icode = aarch64_constant_pool_reload_icode (mode);
5934 return NO_REGS;
5937 /* Without the TARGET_SIMD instructions we cannot move a Q register
5938 to a Q register directly. We need a scratch. */
5939 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5940 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5941 && reg_class_subset_p (rclass, FP_REGS))
5943 if (mode == TFmode)
5944 sri->icode = CODE_FOR_aarch64_reload_movtf;
5945 else if (mode == TImode)
5946 sri->icode = CODE_FOR_aarch64_reload_movti;
5947 return NO_REGS;
5950 /* A TFmode or TImode memory access should be handled via an FP_REGS
5951 because AArch64 has richer addressing modes for LDR/STR instructions
5952 than LDP/STP instructions. */
5953 if (TARGET_FLOAT && rclass == GENERAL_REGS
5954 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5955 return FP_REGS;
5957 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
5958 return GENERAL_REGS;
5960 return NO_REGS;
5963 static bool
5964 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
5966 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
5968 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
5969 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
5970 if (frame_pointer_needed)
5971 return to == HARD_FRAME_POINTER_REGNUM;
5972 return true;
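/* Implement INITIAL_ELIMINATION_OFFSET.  Return the byte offset to add
   when eliminating FROM (the arg or frame pointer) in favour of TO (the
   hard frame pointer or stack pointer), using the offsets recorded by
   aarch64_layout_frame.  */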
5975 HOST_WIDE_INT
5976 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5978 aarch64_layout_frame ();
5980 if (to == HARD_FRAME_POINTER_REGNUM)
5982 if (from == ARG_POINTER_REGNUM)
5983 return cfun->machine->frame.hard_fp_offset;
5985 if (from == FRAME_POINTER_REGNUM)
5986 return cfun->machine->frame.hard_fp_offset
5987 - cfun->machine->frame.locals_offset;
5990 if (to == STACK_POINTER_REGNUM)
5992 if (from == FRAME_POINTER_REGNUM)
5993 return cfun->machine->frame.frame_size
5994 - cfun->machine->frame.locals_offset;
5997 return cfun->machine->frame.frame_size;
6000 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
6001 previous frame. */
6004 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
6006 if (count != 0)
6007 return const0_rtx;
6008 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
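/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  The template is two
   PC-relative literal loads (the target function address into IP1 and the
   static chain value into the static chain register) followed by an
   indirect branch through IP1.  The two pointer-sized literal slots are
   emitted as zeros here and filled in by aarch64_trampoline_init.  */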
6012 static void
6013 aarch64_asm_trampoline_template (FILE *f)
6015 if (TARGET_ILP32)
6017 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
6018 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
6020 else
6022 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
6023 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
6025 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
6026 assemble_aligned_integer (4, const0_rtx);
6027 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
6028 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
6031 static void
6032 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
6034 rtx fnaddr, mem, a_tramp;
6035 const int tramp_code_sz = 16;
6037 /* No need to copy the trailing D-words; we fill those in below. */
6038 emit_block_move (m_tramp, assemble_trampoline_template (),
6039 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
6040 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
6041 fnaddr = XEXP (DECL_RTL (fndecl), 0);
6042 if (GET_MODE (fnaddr) != ptr_mode)
6043 fnaddr = convert_memory_address (ptr_mode, fnaddr);
6044 emit_move_insn (mem, fnaddr);
6046 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6047 emit_move_insn (mem, chain_value);
6049 /* XXX We should really define a "clear_cache" pattern and use
6050 gen_clear_cache(). */
6051 a_tramp = XEXP (m_tramp, 0);
6052 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6053 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6054 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6055 ptr_mode);
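/* Implement TARGET_CLASS_MAX_NREGS.  Return how many registers of class
   REGCLASS are needed to hold a value of mode MODE; for example a 16-byte
   vector mode needs a single V-register, while TImode in GENERAL_REGS
   needs two X-registers.  */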
6058 static unsigned char
6059 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6061 switch (regclass)
6063 case CALLER_SAVE_REGS:
6064 case POINTER_REGS:
6065 case GENERAL_REGS:
6066 case ALL_REGS:
6067 case POINTER_AND_FP_REGS:
6068 case FP_REGS:
6069 case FP_LO_REGS:
6070 return
6071 aarch64_vector_mode_p (mode)
6072 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6073 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6074 case STACK_REG:
6075 return 1;
6077 case NO_REGS:
6078 return 0;
6080 default:
6081 break;
6083 gcc_unreachable ();
6086 static reg_class_t
6087 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6089 if (regclass == POINTER_REGS)
6090 return GENERAL_REGS;
6092 if (regclass == STACK_REG)
6094 if (REG_P (x)
6095 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6096 return regclass;
6098 return NO_REGS;
6101 /* Register elimination can result in a request for
6102 SP+constant->FP_REGS. We cannot support such operations, which
6103 use SP as the source and an FP_REG as the destination, so reject
6104 them outright. */
6105 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6107 rtx lhs = XEXP (x, 0);
6109 /* Look through a possible SUBREG introduced by ILP32. */
6110 if (GET_CODE (lhs) == SUBREG)
6111 lhs = SUBREG_REG (lhs);
6113 gcc_assert (REG_P (lhs));
6114 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6115 POINTER_REGS));
6116 return NO_REGS;
6119 return regclass;
6122 void
6123 aarch64_asm_output_labelref (FILE* f, const char *name)
6125 asm_fprintf (f, "%U%s", name);
6128 static void
6129 aarch64_elf_asm_constructor (rtx symbol, int priority)
6131 if (priority == DEFAULT_INIT_PRIORITY)
6132 default_ctor_section_asm_out_constructor (symbol, priority);
6133 else
6135 section *s;
6136 /* Although priority is known to be in the range [0, 65535], and so
6137 18 bytes would be enough, the compiler might not know that. To avoid
6138 a -Wformat-truncation false positive, use a larger size. */
6139 char buf[23];
6140 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
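	  /* The "%.5u" zero-pads the priority, e.g. priority 65 gives
	     ".init_array.00065", so that lexicographic section ordering
	     matches numeric priority order.  */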
6141 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6142 switch_to_section (s);
6143 assemble_align (POINTER_SIZE);
6144 assemble_aligned_integer (POINTER_BYTES, symbol);
6148 static void
6149 aarch64_elf_asm_destructor (rtx symbol, int priority)
6151 if (priority == DEFAULT_INIT_PRIORITY)
6152 default_dtor_section_asm_out_destructor (symbol, priority);
6153 else
6155 section *s;
6156 /* Although priority is known to be in the range [0, 65535], and so
6157 18 bytes would be enough, the compiler might not know that. To avoid
6158 a -Wformat-truncation false positive, use a larger size. */
6159 char buf[23];
6160 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6161 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6162 switch_to_section (s);
6163 assemble_align (POINTER_SIZE);
6164 assemble_aligned_integer (POINTER_BYTES, symbol);
6168 const char*
6169 aarch64_output_casesi (rtx *operands)
6171 char buf[100];
6172 char label[100];
6173 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6174 int index;
6175 static const char *const patterns[4][2] =
6178 "ldrb\t%w3, [%0,%w1,uxtw]",
6179 "add\t%3, %4, %w3, sxtb #2"
6182 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6183 "add\t%3, %4, %w3, sxth #2"
6186 "ldr\t%w3, [%0,%w1,uxtw #2]",
6187 "add\t%3, %4, %w3, sxtw #2"
6189 /* We assume that DImode is only generated when not optimizing and
6190 that we don't really need 64-bit address offsets. That would
6191 imply an object file with 8GB of code in a single function! */
6193 "ldr\t%w3, [%0,%w1,uxtw #2]",
6194 "add\t%3, %4, %w3, sxtw #2"
6198 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6200 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6201 index = exact_log2 (GET_MODE_SIZE (mode));
6203 gcc_assert (index >= 0 && index <= 3);
6205 /* Need to implement table size reduction, by changing the code below. */
6206 output_asm_insn (patterns[index][0], operands);
6207 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6208 snprintf (buf, sizeof (buf),
6209 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6210 output_asm_insn (buf, operands);
6211 output_asm_insn (patterns[index][1], operands);
6212 output_asm_insn ("br\t%3", operands);
6213 assemble_label (asm_out_file, label);
6214 return "";
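/* The emitted sequence is: a table load (LDRB/LDRH/LDR depending on the
   entry size), an ADR that materialises the address of the Lrtx label
   emitted just after the branch, an ADD that scales the sign-extended
   entry by 4 and adds it to that base, and finally a BR through the
   result.  */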
6218 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6219 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6220 operator. */
6223 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6225 if (shift >= 0 && shift <= 3)
6227 int size;
6228 for (size = 8; size <= 32; size *= 2)
6230 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6231 if (mask == bits << shift)
6232 return size;
6235 return 0;
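/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, because 0x1fe is
   0xff shifted left by one and therefore matches a UXTB operand with a
   shift of 1; a mask that is not 0xff, 0xffff or 0xffffffff shifted by
   0..3 yields 0.  */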
6238 /* Constant pools are per function only when PC relative
6239 literal loads are true or we are in the large memory
6240 model. */
6242 static inline bool
6243 aarch64_can_use_per_function_literal_pools_p (void)
6245 return (aarch64_pcrelative_literal_loads
6246 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6249 static bool
6250 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6252 /* FIXME: In an ideal world this would work similarly
6253 to the logic in aarch64_select_rtx_section, but this
6254 breaks bootstrap in gccgo. For now we work around
6255 this by returning false here. */
6256 return false;
6259 /* Select appropriate section for constants depending
6260 on where we place literal pools. */
6262 static section *
6263 aarch64_select_rtx_section (machine_mode mode,
6264 rtx x,
6265 unsigned HOST_WIDE_INT align)
6267 if (aarch64_can_use_per_function_literal_pools_p ())
6268 return function_section (current_function_decl);
6270 return default_elf_select_rtx_section (mode, x, align);
6273 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6274 void
6275 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6276 HOST_WIDE_INT offset)
6278 /* When using per-function literal pools, we must ensure that any code
6279 section is aligned to the minimal instruction length, lest we get
6280 errors from the assembler about "unaligned instructions". */
6281 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6282 ASM_OUTPUT_ALIGN (f, 2);
6285 /* Costs. */
6287 /* Helper function for rtx cost calculation. Strip a shift expression
6288 from X. Returns the inner operand if successful, or the original
6289 expression on failure. */
6290 static rtx
6291 aarch64_strip_shift (rtx x)
6293 rtx op = x;
6295 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6296 we can convert both to ROR during final output. */
6297 if ((GET_CODE (op) == ASHIFT
6298 || GET_CODE (op) == ASHIFTRT
6299 || GET_CODE (op) == LSHIFTRT
6300 || GET_CODE (op) == ROTATERT
6301 || GET_CODE (op) == ROTATE)
6302 && CONST_INT_P (XEXP (op, 1)))
6303 return XEXP (op, 0);
6305 if (GET_CODE (op) == MULT
6306 && CONST_INT_P (XEXP (op, 1))
6307 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6308 return XEXP (op, 0);
6310 return x;
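/* For instance, (ashift (reg) (const_int 3)) and its multiplication form
   (mult (reg) (const_int 8)) both strip to (reg); a shift by a register
   amount is returned unchanged.  */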
6313 /* Helper function for rtx cost calculation. Strip an extend
6314 expression from X. Returns the inner operand if successful, or the
6315 original expression on failure. We deal with a number of possible
6316 canonicalization variations here. If STRIP_SHIFT is true, then
6317 we can strip off a shift also. */
6318 static rtx
6319 aarch64_strip_extend (rtx x, bool strip_shift)
6321 scalar_int_mode mode;
6322 rtx op = x;
6324 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6325 return op;
6327 /* Zero and sign extraction of a widened value. */
6328 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6329 && XEXP (op, 2) == const0_rtx
6330 && GET_CODE (XEXP (op, 0)) == MULT
6331 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6332 XEXP (op, 1)))
6333 return XEXP (XEXP (op, 0), 0);
6335 /* It can also be represented (for zero-extend) as an AND with an
6336 immediate. */
6337 if (GET_CODE (op) == AND
6338 && GET_CODE (XEXP (op, 0)) == MULT
6339 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6340 && CONST_INT_P (XEXP (op, 1))
6341 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6342 INTVAL (XEXP (op, 1))) != 0)
6343 return XEXP (XEXP (op, 0), 0);
6345 /* Now handle extended register, as this may also have an optional
6346 left shift by 1..4. */
6347 if (strip_shift
6348 && GET_CODE (op) == ASHIFT
6349 && CONST_INT_P (XEXP (op, 1))
6350 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6351 op = XEXP (op, 0);
6353 if (GET_CODE (op) == ZERO_EXTEND
6354 || GET_CODE (op) == SIGN_EXTEND)
6355 op = XEXP (op, 0);
6357 if (op != x)
6358 return op;
6360 return x;
6363 /* Return true iff CODE is a shift supported in combination
6364 with arithmetic instructions. */
6366 static bool
6367 aarch64_shift_p (enum rtx_code code)
6369 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6373 /* Return true iff X is a cheap shift without a sign extend. */
6375 static bool
6376 aarch64_cheap_mult_shift_p (rtx x)
6378 rtx op0, op1;
6380 op0 = XEXP (x, 0);
6381 op1 = XEXP (x, 1);
6383 if (!(aarch64_tune_params.extra_tuning_flags
6384 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6385 return false;
6387 if (GET_CODE (op0) == SIGN_EXTEND)
6388 return false;
6390 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6391 && UINTVAL (op1) <= 4)
6392 return true;
6394 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6395 return false;
6397 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6399 if (l2 > 0 && l2 <= 4)
6400 return true;
6402 return false;
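/* For example, with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND set,
   (ashift (reg) (const_int 3)) is considered cheap, while
   (mult (reg) (const_int 64)) is not, since the implied shift amount of 6
   is outside the 1..4 range.  */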
6405 /* Helper function for rtx cost calculation. Calculate the cost of
6406 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6407 Return the calculated cost of the expression, recursing manually in to
6408 operands where needed. */
6410 static int
6411 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6413 rtx op0, op1;
6414 const struct cpu_cost_table *extra_cost
6415 = aarch64_tune_params.insn_extra_cost;
6416 int cost = 0;
6417 bool compound_p = (outer == PLUS || outer == MINUS);
6418 machine_mode mode = GET_MODE (x);
6420 gcc_checking_assert (code == MULT);
6422 op0 = XEXP (x, 0);
6423 op1 = XEXP (x, 1);
6425 if (VECTOR_MODE_P (mode))
6426 mode = GET_MODE_INNER (mode);
6428 /* Integer multiply/fma. */
6429 if (GET_MODE_CLASS (mode) == MODE_INT)
6431 /* The multiply will be canonicalized as a shift, cost it as such. */
6432 if (aarch64_shift_p (GET_CODE (x))
6433 || (CONST_INT_P (op1)
6434 && exact_log2 (INTVAL (op1)) > 0))
6436 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6437 || GET_CODE (op0) == SIGN_EXTEND;
6438 if (speed)
6440 if (compound_p)
6442 /* If the shift is considered cheap,
6443 then don't add any cost. */
6444 if (aarch64_cheap_mult_shift_p (x))
6446 else if (REG_P (op1))
6447 /* ARITH + shift-by-register. */
6448 cost += extra_cost->alu.arith_shift_reg;
6449 else if (is_extend)
6450 /* ARITH + extended register. We don't have a cost field
6451 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6452 cost += extra_cost->alu.extend_arith;
6453 else
6454 /* ARITH + shift-by-immediate. */
6455 cost += extra_cost->alu.arith_shift;
6457 else
6458 /* LSL (immediate). */
6459 cost += extra_cost->alu.shift;
6462 /* Strip extends as we will have costed them in the case above. */
6463 if (is_extend)
6464 op0 = aarch64_strip_extend (op0, true);
6466 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6468 return cost;
6471 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6472 compound and let the below cases handle it. After all, MNEG is a
6473 special-case alias of MSUB. */
6474 if (GET_CODE (op0) == NEG)
6476 op0 = XEXP (op0, 0);
6477 compound_p = true;
6480 /* Integer multiplies or FMAs have zero/sign extending variants. */
6481 if ((GET_CODE (op0) == ZERO_EXTEND
6482 && GET_CODE (op1) == ZERO_EXTEND)
6483 || (GET_CODE (op0) == SIGN_EXTEND
6484 && GET_CODE (op1) == SIGN_EXTEND))
6486 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6487 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6489 if (speed)
6491 if (compound_p)
6492 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6493 cost += extra_cost->mult[0].extend_add;
6494 else
6495 /* MUL/SMULL/UMULL. */
6496 cost += extra_cost->mult[0].extend;
6499 return cost;
6502 /* This is either an integer multiply or a MADD. In both cases
6503 we want to recurse and cost the operands. */
6504 cost += rtx_cost (op0, mode, MULT, 0, speed);
6505 cost += rtx_cost (op1, mode, MULT, 1, speed);
6507 if (speed)
6509 if (compound_p)
6510 /* MADD/MSUB. */
6511 cost += extra_cost->mult[mode == DImode].add;
6512 else
6513 /* MUL. */
6514 cost += extra_cost->mult[mode == DImode].simple;
6517 return cost;
6519 else
6521 if (speed)
6523 /* Floating-point FMA/FMUL can also support negations of the
6524 operands, unless the rounding mode is upward or downward in
6525 which case FNMUL is different from FMUL with operand negation. */
6526 bool neg0 = GET_CODE (op0) == NEG;
6527 bool neg1 = GET_CODE (op1) == NEG;
6528 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6530 if (neg0)
6531 op0 = XEXP (op0, 0);
6532 if (neg1)
6533 op1 = XEXP (op1, 0);
6536 if (compound_p)
6537 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6538 cost += extra_cost->fp[mode == DFmode].fma;
6539 else
6540 /* FMUL/FNMUL. */
6541 cost += extra_cost->fp[mode == DFmode].mult;
6544 cost += rtx_cost (op0, mode, MULT, 0, speed);
6545 cost += rtx_cost (op1, mode, MULT, 1, speed);
6546 return cost;
6550 static int
6551 aarch64_address_cost (rtx x,
6552 machine_mode mode,
6553 addr_space_t as ATTRIBUTE_UNUSED,
6554 bool speed)
6556 enum rtx_code c = GET_CODE (x);
6557 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6558 struct aarch64_address_info info;
6559 int cost = 0;
6560 info.shift = 0;
6562 if (!aarch64_classify_address (&info, x, mode, c, false))
6564 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6566 /* This is a CONST or SYMBOL ref which will be split
6567 in a different way depending on the code model in use.
6568 Cost it through the generic infrastructure. */
6569 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6570 /* Divide through by the cost of one instruction to
6571 bring it to the same units as the address costs. */
6572 cost_symbol_ref /= COSTS_N_INSNS (1);
6573 /* The cost is then the cost of preparing the address,
6574 followed by an immediate (possibly 0) offset. */
6575 return cost_symbol_ref + addr_cost->imm_offset;
6577 else
6579 /* This is most likely a jump table from a case
6580 statement. */
6581 return addr_cost->register_offset;
6585 switch (info.type)
6587 case ADDRESS_LO_SUM:
6588 case ADDRESS_SYMBOLIC:
6589 case ADDRESS_REG_IMM:
6590 cost += addr_cost->imm_offset;
6591 break;
6593 case ADDRESS_REG_WB:
6594 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6595 cost += addr_cost->pre_modify;
6596 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6597 cost += addr_cost->post_modify;
6598 else
6599 gcc_unreachable ();
6601 break;
6603 case ADDRESS_REG_REG:
6604 cost += addr_cost->register_offset;
6605 break;
6607 case ADDRESS_REG_SXTW:
6608 cost += addr_cost->register_sextend;
6609 break;
6611 case ADDRESS_REG_UXTW:
6612 cost += addr_cost->register_zextend;
6613 break;
6615 default:
6616 gcc_unreachable ();
6620 if (info.shift > 0)
6622 /* For the sake of calculating the cost of the shifted register
6623 component, we can treat same sized modes in the same way. */
6624 switch (GET_MODE_BITSIZE (mode))
6626 case 16:
6627 cost += addr_cost->addr_scale_costs.hi;
6628 break;
6630 case 32:
6631 cost += addr_cost->addr_scale_costs.si;
6632 break;
6634 case 64:
6635 cost += addr_cost->addr_scale_costs.di;
6636 break;
6638 /* We can't tell, or this is a 128-bit vector. */
6639 default:
6640 cost += addr_cost->addr_scale_costs.ti;
6641 break;
6645 return cost;
6648 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6649 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6650 to be taken. */
6653 aarch64_branch_cost (bool speed_p, bool predictable_p)
6655 /* When optimizing for speed, use the cost of unpredictable branches. */
6656 const struct cpu_branch_cost *branch_costs =
6657 aarch64_tune_params.branch_costs;
6659 if (!speed_p || predictable_p)
6660 return branch_costs->predictable;
6661 else
6662 return branch_costs->unpredictable;
6665 /* Return true if the RTX X in mode MODE is a zero or sign extract
6666 usable in an ADD or SUB (extended register) instruction. */
6667 static bool
6668 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6670 /* Catch add with a sign extract.
6671 This is add_<optab><mode>_multp2. */
6672 if (GET_CODE (x) == SIGN_EXTRACT
6673 || GET_CODE (x) == ZERO_EXTRACT)
6675 rtx op0 = XEXP (x, 0);
6676 rtx op1 = XEXP (x, 1);
6677 rtx op2 = XEXP (x, 2);
6679 if (GET_CODE (op0) == MULT
6680 && CONST_INT_P (op1)
6681 && op2 == const0_rtx
6682 && CONST_INT_P (XEXP (op0, 1))
6683 && aarch64_is_extend_from_extract (mode,
6684 XEXP (op0, 1),
6685 op1))
6687 return true;
6690 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6691 No shift. */
6692 else if (GET_CODE (x) == SIGN_EXTEND
6693 || GET_CODE (x) == ZERO_EXTEND)
6694 return REG_P (XEXP (x, 0));
6696 return false;
6699 static bool
6700 aarch64_frint_unspec_p (unsigned int u)
6702 switch (u)
6704 case UNSPEC_FRINTZ:
6705 case UNSPEC_FRINTP:
6706 case UNSPEC_FRINTM:
6707 case UNSPEC_FRINTA:
6708 case UNSPEC_FRINTN:
6709 case UNSPEC_FRINTX:
6710 case UNSPEC_FRINTI:
6711 return true;
6713 default:
6714 return false;
6718 /* Return true iff X is an rtx that will match an extr instruction
6719 i.e. as described in the *extr<mode>5_insn family of patterns.
6720 OP0 and OP1 will be set to the operands of the shifts involved
6721 on success and will be NULL_RTX otherwise. */
6723 static bool
6724 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6726 rtx op0, op1;
6727 scalar_int_mode mode;
6728 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6729 return false;
6731 *res_op0 = NULL_RTX;
6732 *res_op1 = NULL_RTX;
6734 if (GET_CODE (x) != IOR)
6735 return false;
6737 op0 = XEXP (x, 0);
6738 op1 = XEXP (x, 1);
6740 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6741 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6743 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6744 if (GET_CODE (op1) == ASHIFT)
6745 std::swap (op0, op1);
6747 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6748 return false;
6750 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6751 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6753 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6754 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6756 *res_op0 = XEXP (op0, 0);
6757 *res_op1 = XEXP (op1, 0);
6758 return true;
6762 return false;
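/* For example, in DImode
     (ior (ashift (reg A) (const_int 16)) (lshiftrt (reg B) (const_int 48)))
   matches, setting *RES_OP0 to A and *RES_OP1 to B, because the two shift
   amounts sum to the mode bitsize of 64.  */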
6765 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6766 storing it in *COST. Result is true if the total cost of the operation
6767 has now been calculated. */
6768 static bool
6769 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6771 rtx inner;
6772 rtx comparator;
6773 enum rtx_code cmpcode;
6775 if (COMPARISON_P (op0))
6777 inner = XEXP (op0, 0);
6778 comparator = XEXP (op0, 1);
6779 cmpcode = GET_CODE (op0);
6781 else
6783 inner = op0;
6784 comparator = const0_rtx;
6785 cmpcode = NE;
6788 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6790 /* Conditional branch. */
6791 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6792 return true;
6793 else
6795 if (cmpcode == NE || cmpcode == EQ)
6797 if (comparator == const0_rtx)
6799 /* TBZ/TBNZ/CBZ/CBNZ. */
6800 if (GET_CODE (inner) == ZERO_EXTRACT)
6801 /* TBZ/TBNZ. */
6802 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6803 ZERO_EXTRACT, 0, speed);
6804 else
6805 /* CBZ/CBNZ. */
6806 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6808 return true;
6811 else if (cmpcode == LT || cmpcode == GE)
6813 /* TBZ/TBNZ. */
6814 if (comparator == const0_rtx)
6815 return true;
6819 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6821 /* CCMP. */
6822 if (GET_CODE (op1) == COMPARE)
6824 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6825 if (XEXP (op1, 1) == const0_rtx)
6826 *cost += 1;
6827 if (speed)
6829 machine_mode mode = GET_MODE (XEXP (op1, 0));
6830 const struct cpu_cost_table *extra_cost
6831 = aarch64_tune_params.insn_extra_cost;
6833 if (GET_MODE_CLASS (mode) == MODE_INT)
6834 *cost += extra_cost->alu.arith;
6835 else
6836 *cost += extra_cost->fp[mode == DFmode].compare;
6838 return true;
6841 /* It's a conditional operation based on the status flags,
6842 so it must be some flavor of CSEL. */
6844 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6845 if (GET_CODE (op1) == NEG
6846 || GET_CODE (op1) == NOT
6847 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6848 op1 = XEXP (op1, 0);
6849 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6851 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6852 op1 = XEXP (op1, 0);
6853 op2 = XEXP (op2, 0);
6856 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6857 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6858 return true;
6861 /* We don't know what this is, cost all operands. */
6862 return false;
6865 /* Check whether X is a bitfield operation of the form shift + extend that
6866 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6867 operand to which the bitfield operation is applied. Otherwise return
6868 NULL_RTX. */
6870 static rtx
6871 aarch64_extend_bitfield_pattern_p (rtx x)
6873 rtx_code outer_code = GET_CODE (x);
6874 machine_mode outer_mode = GET_MODE (x);
6876 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6877 && outer_mode != SImode && outer_mode != DImode)
6878 return NULL_RTX;
6880 rtx inner = XEXP (x, 0);
6881 rtx_code inner_code = GET_CODE (inner);
6882 machine_mode inner_mode = GET_MODE (inner);
6883 rtx op = NULL_RTX;
6885 switch (inner_code)
6887 case ASHIFT:
6888 if (CONST_INT_P (XEXP (inner, 1))
6889 && (inner_mode == QImode || inner_mode == HImode))
6890 op = XEXP (inner, 0);
6891 break;
6892 case LSHIFTRT:
6893 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6894 && (inner_mode == QImode || inner_mode == HImode))
6895 op = XEXP (inner, 0);
6896 break;
6897 case ASHIFTRT:
6898 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6899 && (inner_mode == QImode || inner_mode == HImode))
6900 op = XEXP (inner, 0);
6901 break;
6902 default:
6903 break;
6906 return op;
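/* For instance, (zero_extend:SI (lshiftrt:HI (reg:HI R) (const_int N)))
   returns R, since the shift plus zero-extension maps onto a single UBFX;
   the sign_extend/ashiftrt combination is accepted in the same way for
   SBFX.  */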
6909 /* Return true if the mask and a shift amount from an RTX of the form
6910 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6911 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6913 bool
6914 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6915 rtx shft_amnt)
6917 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6918 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6919 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6920 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
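/* For example, in SImode a shift amount of 8 with mask 0xff00 is accepted:
   (0xff00 >> 8) + 1 is a power of two and no mask bits lie below the shift
   amount, so (X << 8) & 0xff00 can become a single UBFIZ.  */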
6923 /* Calculate the cost of calculating X, storing it in *COST. Result
6924 is true if the total cost of the operation has now been calculated. */
6925 static bool
6926 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6927 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6929 rtx op0, op1, op2;
6930 const struct cpu_cost_table *extra_cost
6931 = aarch64_tune_params.insn_extra_cost;
6932 int code = GET_CODE (x);
6933 scalar_int_mode int_mode;
6935 /* By default, assume that everything has equivalent cost to the
6936 cheapest instruction. Any additional costs are applied as a delta
6937 above this default. */
6938 *cost = COSTS_N_INSNS (1);
6940 switch (code)
6942 case SET:
6943 /* The cost depends entirely on the operands to SET. */
6944 *cost = 0;
6945 op0 = SET_DEST (x);
6946 op1 = SET_SRC (x);
6948 switch (GET_CODE (op0))
6950 case MEM:
6951 if (speed)
6953 rtx address = XEXP (op0, 0);
6954 if (VECTOR_MODE_P (mode))
6955 *cost += extra_cost->ldst.storev;
6956 else if (GET_MODE_CLASS (mode) == MODE_INT)
6957 *cost += extra_cost->ldst.store;
6958 else if (mode == SFmode)
6959 *cost += extra_cost->ldst.storef;
6960 else if (mode == DFmode)
6961 *cost += extra_cost->ldst.stored;
6963 *cost +=
6964 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6965 0, speed));
6968 *cost += rtx_cost (op1, mode, SET, 1, speed);
6969 return true;
6971 case SUBREG:
6972 if (! REG_P (SUBREG_REG (op0)))
6973 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6975 /* Fall through. */
6976 case REG:
6977 /* The cost is one per vector-register copied. */
6978 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6980 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
6981 *cost = COSTS_N_INSNS (nregs);
6983 /* const0_rtx is in general free, but we will use an
6984 instruction to set a register to 0. */
6985 else if (REG_P (op1) || op1 == const0_rtx)
6987 /* The cost is 1 per register copied. */
6988 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
6989 *cost = COSTS_N_INSNS (nregs);
6991 else
6992 /* Cost is just the cost of the RHS of the set. */
6993 *cost += rtx_cost (op1, mode, SET, 1, speed);
6994 return true;
6996 case ZERO_EXTRACT:
6997 case SIGN_EXTRACT:
6998 /* Bit-field insertion. Strip any redundant widening of
6999 the RHS to meet the width of the target. */
7000 if (GET_CODE (op1) == SUBREG)
7001 op1 = SUBREG_REG (op1);
7002 if ((GET_CODE (op1) == ZERO_EXTEND
7003 || GET_CODE (op1) == SIGN_EXTEND)
7004 && CONST_INT_P (XEXP (op0, 1))
7005 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
7006 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
7007 op1 = XEXP (op1, 0);
7009 if (CONST_INT_P (op1))
7011 /* MOV immediate is assumed to always be cheap. */
7012 *cost = COSTS_N_INSNS (1);
7014 else
7016 /* BFM. */
7017 if (speed)
7018 *cost += extra_cost->alu.bfi;
7019 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
7022 return true;
7024 default:
7025 /* We can't make sense of this, assume default cost. */
7026 *cost = COSTS_N_INSNS (1);
7027 return false;
7029 return false;
7031 case CONST_INT:
7032 /* If an instruction can incorporate a constant within the
7033 instruction, the instruction's expression avoids calling
7034 rtx_cost() on the constant. If rtx_cost() is called on a
7035 constant, then it is usually because the constant must be
7036 moved into a register by one or more instructions.
7038 The exception is constant 0, which can be expressed
7039 as XZR/WZR and is therefore free; the one complication is that
7040 (set (reg) (const0_rtx)) does require an instruction and must be
7041 costed as a move. However, we can catch that when we cost the SET, so
7042 we don't need to consider it here. */
7043 if (x == const0_rtx)
7044 *cost = 0;
7045 else
7047 /* To an approximation, building any other constant is
7048 proportionally expensive to the number of instructions
7049 required to build that constant. This is true whether we
7050 are compiling for SPEED or otherwise. */
7051 if (!is_a <scalar_int_mode> (mode, &int_mode))
7052 int_mode = word_mode;
7053 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7054 (NULL_RTX, x, false, int_mode));
7056 return true;
7058 case CONST_DOUBLE:
7060 /* First determine number of instructions to do the move
7061 as an integer constant. */
7062 if (!aarch64_float_const_representable_p (x)
7063 && !aarch64_can_const_movi_rtx_p (x, mode)
7064 && aarch64_float_const_rtx_p (x))
7066 unsigned HOST_WIDE_INT ival;
7067 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7068 gcc_assert (succeed);
7070 scalar_int_mode imode = (mode == HFmode
7071 ? SImode
7072 : int_mode_for_mode (mode).require ());
7073 int ncost = aarch64_internal_mov_immediate
7074 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7075 *cost += COSTS_N_INSNS (ncost);
7076 return true;
7079 if (speed)
7081 /* mov[df,sf]_aarch64. */
7082 if (aarch64_float_const_representable_p (x))
7083 /* FMOV (scalar immediate). */
7084 *cost += extra_cost->fp[mode == DFmode].fpconst;
7085 else if (!aarch64_float_const_zero_rtx_p (x))
7087 /* This will be a load from memory. */
7088 if (mode == DFmode)
7089 *cost += extra_cost->ldst.loadd;
7090 else
7091 *cost += extra_cost->ldst.loadf;
7093 else
7094 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7095 or MOV v0.s[0], wzr - neither of which are modeled by the
7096 cost tables. Just use the default cost. */
7101 return true;
7103 case MEM:
7104 if (speed)
7106 /* For loads we want the base cost of a load, plus an
7107 approximation for the additional cost of the addressing
7108 mode. */
7109 rtx address = XEXP (x, 0);
7110 if (VECTOR_MODE_P (mode))
7111 *cost += extra_cost->ldst.loadv;
7112 else if (GET_MODE_CLASS (mode) == MODE_INT)
7113 *cost += extra_cost->ldst.load;
7114 else if (mode == SFmode)
7115 *cost += extra_cost->ldst.loadf;
7116 else if (mode == DFmode)
7117 *cost += extra_cost->ldst.loadd;
7119 *cost +=
7120 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7121 0, speed));
7124 return true;
7126 case NEG:
7127 op0 = XEXP (x, 0);
7129 if (VECTOR_MODE_P (mode))
7131 if (speed)
7133 /* FNEG. */
7134 *cost += extra_cost->vect.alu;
7136 return false;
7139 if (GET_MODE_CLASS (mode) == MODE_INT)
7141 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7142 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7144 /* CSETM. */
7145 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7146 return true;
7149 /* Cost this as SUB wzr, X. */
7150 op0 = CONST0_RTX (mode);
7151 op1 = XEXP (x, 0);
7152 goto cost_minus;
7155 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7157 /* Support (neg(fma...)) as a single instruction only if
7158 sign of zeros is unimportant. This matches the decision
7159 making in aarch64.md. */
7160 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7162 /* FNMADD. */
7163 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7164 return true;
7166 if (GET_CODE (op0) == MULT)
7168 /* FNMUL. */
7169 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7170 return true;
7172 if (speed)
7173 /* FNEG. */
7174 *cost += extra_cost->fp[mode == DFmode].neg;
7175 return false;
7178 return false;
7180 case CLRSB:
7181 case CLZ:
7182 if (speed)
7184 if (VECTOR_MODE_P (mode))
7185 *cost += extra_cost->vect.alu;
7186 else
7187 *cost += extra_cost->alu.clz;
7190 return false;
7192 case COMPARE:
7193 op0 = XEXP (x, 0);
7194 op1 = XEXP (x, 1);
7196 if (op1 == const0_rtx
7197 && GET_CODE (op0) == AND)
7199 x = op0;
7200 mode = GET_MODE (op0);
7201 goto cost_logic;
7204 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7206 /* TODO: A write to the CC flags possibly costs extra, this
7207 needs encoding in the cost tables. */
7209 mode = GET_MODE (op0);
7210 /* ANDS. */
7211 if (GET_CODE (op0) == AND)
7213 x = op0;
7214 goto cost_logic;
7217 if (GET_CODE (op0) == PLUS)
7219 /* ADDS (and CMN alias). */
7220 x = op0;
7221 goto cost_plus;
7224 if (GET_CODE (op0) == MINUS)
7226 /* SUBS. */
7227 x = op0;
7228 goto cost_minus;
7231 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7232 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7233 && CONST_INT_P (XEXP (op0, 2)))
7235 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7236 Handle it here directly rather than going to cost_logic
7237 since we know the immediate generated for the TST is valid
7238 so we can avoid creating an intermediate rtx for it only
7239 for costing purposes. */
7240 if (speed)
7241 *cost += extra_cost->alu.logical;
7243 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7244 ZERO_EXTRACT, 0, speed);
7245 return true;
7248 if (GET_CODE (op1) == NEG)
7250 /* CMN. */
7251 if (speed)
7252 *cost += extra_cost->alu.arith;
7254 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7255 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7256 return true;
7259 /* CMP.
7261 Compare can freely swap the order of operands, and
7262 canonicalization puts the more complex operation first.
7263 But the integer MINUS logic expects the shift/extend
7264 operation in op1. */
7265 if (! (REG_P (op0)
7266 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7268 op0 = XEXP (x, 1);
7269 op1 = XEXP (x, 0);
7271 goto cost_minus;
7274 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7276 /* FCMP. */
7277 if (speed)
7278 *cost += extra_cost->fp[mode == DFmode].compare;
7280 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7282 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7283 /* FCMP supports constant 0.0 for no extra cost. */
7284 return true;
7286 return false;
7289 if (VECTOR_MODE_P (mode))
7291 /* Vector compare. */
7292 if (speed)
7293 *cost += extra_cost->vect.alu;
7295 if (aarch64_float_const_zero_rtx_p (op1))
7297 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7298 cost. */
7299 return true;
7301 return false;
7303 return false;
7305 case MINUS:
7307 op0 = XEXP (x, 0);
7308 op1 = XEXP (x, 1);
7310 cost_minus:
7311 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7313 /* Detect valid immediates. */
7314 if ((GET_MODE_CLASS (mode) == MODE_INT
7315 || (GET_MODE_CLASS (mode) == MODE_CC
7316 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7317 && CONST_INT_P (op1)
7318 && aarch64_uimm12_shift (INTVAL (op1)))
7320 if (speed)
7321 /* SUB(S) (immediate). */
7322 *cost += extra_cost->alu.arith;
7323 return true;
7326 /* Look for SUB (extended register). */
7327 if (is_a <scalar_int_mode> (mode, &int_mode)
7328 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7330 if (speed)
7331 *cost += extra_cost->alu.extend_arith;
7333 op1 = aarch64_strip_extend (op1, true);
7334 *cost += rtx_cost (op1, VOIDmode,
7335 (enum rtx_code) GET_CODE (op1), 0, speed);
7336 return true;
7339 rtx new_op1 = aarch64_strip_extend (op1, false);
7341 /* Cost this as an FMA-alike operation. */
7342 if ((GET_CODE (new_op1) == MULT
7343 || aarch64_shift_p (GET_CODE (new_op1)))
7344 && code != COMPARE)
7346 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7347 (enum rtx_code) code,
7348 speed);
7349 return true;
7352 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7354 if (speed)
7356 if (VECTOR_MODE_P (mode))
7358 /* Vector SUB. */
7359 *cost += extra_cost->vect.alu;
7361 else if (GET_MODE_CLASS (mode) == MODE_INT)
7363 /* SUB(S). */
7364 *cost += extra_cost->alu.arith;
7366 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7368 /* FSUB. */
7369 *cost += extra_cost->fp[mode == DFmode].addsub;
7372 return true;
7375 case PLUS:
7377 rtx new_op0;
7379 op0 = XEXP (x, 0);
7380 op1 = XEXP (x, 1);
7382 cost_plus:
7383 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7384 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7386 /* CSINC. */
7387 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7388 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7389 return true;
7392 if (GET_MODE_CLASS (mode) == MODE_INT
7393 && CONST_INT_P (op1)
7394 && aarch64_uimm12_shift (INTVAL (op1)))
7396 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7398 if (speed)
7399 /* ADD (immediate). */
7400 *cost += extra_cost->alu.arith;
7401 return true;
7404 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7406 /* Look for ADD (extended register). */
7407 if (is_a <scalar_int_mode> (mode, &int_mode)
7408 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7410 if (speed)
7411 *cost += extra_cost->alu.extend_arith;
7413 op0 = aarch64_strip_extend (op0, true);
7414 *cost += rtx_cost (op0, VOIDmode,
7415 (enum rtx_code) GET_CODE (op0), 0, speed);
7416 return true;
7419 /* Strip any extend, leave shifts behind as we will
7420 cost them through mult_cost. */
7421 new_op0 = aarch64_strip_extend (op0, false);
7423 if (GET_CODE (new_op0) == MULT
7424 || aarch64_shift_p (GET_CODE (new_op0)))
7426 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7427 speed);
7428 return true;
7431 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7433 if (speed)
7435 if (VECTOR_MODE_P (mode))
7437 /* Vector ADD. */
7438 *cost += extra_cost->vect.alu;
7440 else if (GET_MODE_CLASS (mode) == MODE_INT)
7442 /* ADD. */
7443 *cost += extra_cost->alu.arith;
7445 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7447 /* FADD. */
7448 *cost += extra_cost->fp[mode == DFmode].addsub;
7451 return true;
7454 case BSWAP:
7455 *cost = COSTS_N_INSNS (1);
7457 if (speed)
7459 if (VECTOR_MODE_P (mode))
7460 *cost += extra_cost->vect.alu;
7461 else
7462 *cost += extra_cost->alu.rev;
7464 return false;
7466 case IOR:
7467 if (aarch_rev16_p (x))
7469 *cost = COSTS_N_INSNS (1);
7471 if (speed)
7473 if (VECTOR_MODE_P (mode))
7474 *cost += extra_cost->vect.alu;
7475 else
7476 *cost += extra_cost->alu.rev;
7478 return true;
7481 if (aarch64_extr_rtx_p (x, &op0, &op1))
7483 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7484 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7485 if (speed)
7486 *cost += extra_cost->alu.shift;
7488 return true;
7490 /* Fall through. */
7491 case XOR:
7492 case AND:
7493 cost_logic:
7494 op0 = XEXP (x, 0);
7495 op1 = XEXP (x, 1);
7497 if (VECTOR_MODE_P (mode))
7499 if (speed)
7500 *cost += extra_cost->vect.alu;
7501 return true;
7504 if (code == AND
7505 && GET_CODE (op0) == MULT
7506 && CONST_INT_P (XEXP (op0, 1))
7507 && CONST_INT_P (op1)
7508 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7509 INTVAL (op1)) != 0)
7511 /* This is a UBFM/SBFM. */
7512 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7513 if (speed)
7514 *cost += extra_cost->alu.bfx;
7515 return true;
7518 if (is_int_mode (mode, &int_mode))
7520 if (CONST_INT_P (op1))
7522 /* We have a mask + shift version of a UBFIZ
7523 i.e. the *andim_ashift<mode>_bfiz pattern. */
7524 if (GET_CODE (op0) == ASHIFT
7525 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7526 XEXP (op0, 1)))
7528 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7529 (enum rtx_code) code, 0, speed);
7530 if (speed)
7531 *cost += extra_cost->alu.bfx;
7533 return true;
7535 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7537 /* We possibly get the immediate for free, this is not
7538 modelled. */
7539 *cost += rtx_cost (op0, int_mode,
7540 (enum rtx_code) code, 0, speed);
7541 if (speed)
7542 *cost += extra_cost->alu.logical;
7544 return true;
7547 else
7549 rtx new_op0 = op0;
7551 /* Handle ORN, EON, or BIC. */
7552 if (GET_CODE (op0) == NOT)
7553 op0 = XEXP (op0, 0);
7555 new_op0 = aarch64_strip_shift (op0);
7557 /* If we had a shift on op0 then this is a logical-shift-
7558 by-register/immediate operation. Otherwise, this is just
7559 a logical operation. */
7560 if (speed)
7562 if (new_op0 != op0)
7564 /* Shift by immediate. */
7565 if (CONST_INT_P (XEXP (op0, 1)))
7566 *cost += extra_cost->alu.log_shift;
7567 else
7568 *cost += extra_cost->alu.log_shift_reg;
7570 else
7571 *cost += extra_cost->alu.logical;
7574 /* In both cases we want to cost both operands. */
7575 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7576 0, speed);
7577 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7578 1, speed);
7580 return true;
7583 return false;
7585 case NOT:
7586 x = XEXP (x, 0);
7587 op0 = aarch64_strip_shift (x);
7589 if (VECTOR_MODE_P (mode))
7591 /* Vector NOT. */
7592 *cost += extra_cost->vect.alu;
7593 return false;
7596 /* MVN-shifted-reg. */
7597 if (op0 != x)
7599 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7601 if (speed)
7602 *cost += extra_cost->alu.log_shift;
7604 return true;
7606 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7607 Handle the second form here taking care that 'a' in the above can
7608 be a shift. */
7609 else if (GET_CODE (op0) == XOR)
7611 rtx newop0 = XEXP (op0, 0);
7612 rtx newop1 = XEXP (op0, 1);
7613 rtx op0_stripped = aarch64_strip_shift (newop0);
7615 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7616 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7618 if (speed)
7620 if (op0_stripped != newop0)
7621 *cost += extra_cost->alu.log_shift;
7622 else
7623 *cost += extra_cost->alu.logical;
7626 return true;
7628 /* MVN. */
7629 if (speed)
7630 *cost += extra_cost->alu.logical;
7632 return false;
7634 case ZERO_EXTEND:
7636 op0 = XEXP (x, 0);
7637 /* If a value is written in SI mode, then zero extended to DI
7638 mode, the operation will in general be free as a write to
7639 a 'w' register implicitly zeroes the upper bits of an 'x'
7640 register. However, if this is
7642 (set (reg) (zero_extend (reg)))
7644 we must cost the explicit register move. */
7645 if (mode == DImode
7646 && GET_MODE (op0) == SImode
7647 && outer == SET)
7649 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7651 /* If OP_COST is non-zero, then the cost of the zero extend
7652 is effectively the cost of the inner operation. Otherwise
7653 we have a MOV instruction and we take the cost from the MOV
7654 itself. This is true independently of whether we are
7655 optimizing for space or time. */
7656 if (op_cost)
7657 *cost = op_cost;
7659 return true;
7661 else if (MEM_P (op0))
7663 /* All loads can zero extend to any size for free. */
7664 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7665 return true;
7668 op0 = aarch64_extend_bitfield_pattern_p (x);
7669 if (op0)
7671 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7672 if (speed)
7673 *cost += extra_cost->alu.bfx;
7674 return true;
7677 if (speed)
7679 if (VECTOR_MODE_P (mode))
7681 /* UMOV. */
7682 *cost += extra_cost->vect.alu;
7684 else
7686 /* We generate an AND instead of UXTB/UXTH. */
7687 *cost += extra_cost->alu.logical;
7690 return false;
7692 case SIGN_EXTEND:
7693 if (MEM_P (XEXP (x, 0)))
7695 /* LDRSH. */
7696 if (speed)
7698 rtx address = XEXP (XEXP (x, 0), 0);
7699 *cost += extra_cost->ldst.load_sign_extend;
7701 *cost +=
7702 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7703 0, speed));
7705 return true;
7708 op0 = aarch64_extend_bitfield_pattern_p (x);
7709 if (op0)
7711 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7712 if (speed)
7713 *cost += extra_cost->alu.bfx;
7714 return true;
7717 if (speed)
7719 if (VECTOR_MODE_P (mode))
7720 *cost += extra_cost->vect.alu;
7721 else
7722 *cost += extra_cost->alu.extend;
7724 return false;
7726 case ASHIFT:
7727 op0 = XEXP (x, 0);
7728 op1 = XEXP (x, 1);
7730 if (CONST_INT_P (op1))
7732 if (speed)
7734 if (VECTOR_MODE_P (mode))
7736 /* Vector shift (immediate). */
7737 *cost += extra_cost->vect.alu;
7739 else
7741 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7742 aliases. */
7743 *cost += extra_cost->alu.shift;
7747 /* We can incorporate zero/sign extend for free. */
7748 if (GET_CODE (op0) == ZERO_EXTEND
7749 || GET_CODE (op0) == SIGN_EXTEND)
7750 op0 = XEXP (op0, 0);
7752 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7753 return true;
7755 else
7757 if (VECTOR_MODE_P (mode))
7759 if (speed)
7760 /* Vector shift (register). */
7761 *cost += extra_cost->vect.alu;
7763 else
7765 if (speed)
7766 /* LSLV. */
7767 *cost += extra_cost->alu.shift_reg;
7769 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7770 && CONST_INT_P (XEXP (op1, 1))
7771 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7773 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7774 /* We already demanded XEXP (op1, 0) to be REG_P, so
7775 don't recurse into it. */
7776 return true;
7779 return false; /* All arguments need to be in registers. */
7782 case ROTATE:
7783 case ROTATERT:
7784 case LSHIFTRT:
7785 case ASHIFTRT:
7786 op0 = XEXP (x, 0);
7787 op1 = XEXP (x, 1);
7789 if (CONST_INT_P (op1))
7791 /* ASR (immediate) and friends. */
7792 if (speed)
7794 if (VECTOR_MODE_P (mode))
7795 *cost += extra_cost->vect.alu;
7796 else
7797 *cost += extra_cost->alu.shift;
7800 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7801 return true;
7803 else
7805 if (VECTOR_MODE_P (mode))
7807 if (speed)
7808 /* Vector shift (register). */
7809 *cost += extra_cost->vect.alu;
7811 else
7813 if (speed)
7814 /* ASR (register) and friends. */
7815 *cost += extra_cost->alu.shift_reg;
7817 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7818 && CONST_INT_P (XEXP (op1, 1))
7819 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7821 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7822 /* We already demanded XEXP (op1, 0) to be REG_P, so
7823 don't recurse into it. */
7824 return true;
7827 return false; /* All arguments need to be in registers. */
7830 case SYMBOL_REF:
7832 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7833 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7835 /* LDR. */
7836 if (speed)
7837 *cost += extra_cost->ldst.load;
7839 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7840 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7842 /* ADRP, followed by ADD. */
7843 *cost += COSTS_N_INSNS (1);
7844 if (speed)
7845 *cost += 2 * extra_cost->alu.arith;
7847 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7848 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7850 /* ADR. */
7851 if (speed)
7852 *cost += extra_cost->alu.arith;
7855 if (flag_pic)
7857 /* One extra load instruction, after accessing the GOT. */
7858 *cost += COSTS_N_INSNS (1);
7859 if (speed)
7860 *cost += extra_cost->ldst.load;
7862 return true;
7864 case HIGH:
7865 case LO_SUM:
7866 /* ADRP/ADD (immediate). */
7867 if (speed)
7868 *cost += extra_cost->alu.arith;
7869 return true;
7871 case ZERO_EXTRACT:
7872 case SIGN_EXTRACT:
7873 /* UBFX/SBFX. */
7874 if (speed)
7876 if (VECTOR_MODE_P (mode))
7877 *cost += extra_cost->vect.alu;
7878 else
7879 *cost += extra_cost->alu.bfx;
7882 /* We can trust that the immediates used will be correct (there
7883 are no by-register forms), so we need only cost op0. */
7884 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7885 return true;
7887 case MULT:
7888 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7889 /* aarch64_rtx_mult_cost always handles recursion to its
7890 operands. */
7891 return true;
7893 case MOD:
7894 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7895 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7896 an unconditional negate. This case should only ever be reached through
7897 the set_smod_pow2_cheap check in expmed.c. */
7898 if (CONST_INT_P (XEXP (x, 1))
7899 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7900 && (mode == SImode || mode == DImode))
7902 /* We expand to 4 instructions. Reset the baseline. */
7903 *cost = COSTS_N_INSNS (4);
7905 if (speed)
7906 *cost += 2 * extra_cost->alu.logical
7907 + 2 * extra_cost->alu.arith;
7909 return true;
7912 /* Fall-through. */
7913 case UMOD:
7914 if (speed)
7917 /* Slightly prefer UMOD over SMOD. */
7917 if (VECTOR_MODE_P (mode))
7918 *cost += extra_cost->vect.alu;
7919 else if (GET_MODE_CLASS (mode) == MODE_INT)
7920 *cost += (extra_cost->mult[mode == DImode].add
7921 + extra_cost->mult[mode == DImode].idiv
7922 + (code == MOD ? 1 : 0));
7924 return false; /* All arguments need to be in registers. */
7926 case DIV:
7927 case UDIV:
7928 case SQRT:
7929 if (speed)
7931 if (VECTOR_MODE_P (mode))
7932 *cost += extra_cost->vect.alu;
7933 else if (GET_MODE_CLASS (mode) == MODE_INT)
7934 /* There is no integer SQRT, so only DIV and UDIV can get
7935 here. */
7936 *cost += (extra_cost->mult[mode == DImode].idiv
7937 /* Slightly prefer UDIV over SDIV. */
7938 + (code == DIV ? 1 : 0));
7939 else
7940 *cost += extra_cost->fp[mode == DFmode].div;
7942 return false; /* All arguments need to be in registers. */
7944 case IF_THEN_ELSE:
7945 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7946 XEXP (x, 2), cost, speed);
7948 case EQ:
7949 case NE:
7950 case GT:
7951 case GTU:
7952 case LT:
7953 case LTU:
7954 case GE:
7955 case GEU:
7956 case LE:
7957 case LEU:
7959 return false; /* All arguments must be in registers. */
7961 case FMA:
7962 op0 = XEXP (x, 0);
7963 op1 = XEXP (x, 1);
7964 op2 = XEXP (x, 2);
7966 if (speed)
7968 if (VECTOR_MODE_P (mode))
7969 *cost += extra_cost->vect.alu;
7970 else
7971 *cost += extra_cost->fp[mode == DFmode].fma;
7974 /* FMSUB, FNMADD, and FNMSUB are free. */
7975 if (GET_CODE (op0) == NEG)
7976 op0 = XEXP (op0, 0);
7978 if (GET_CODE (op2) == NEG)
7979 op2 = XEXP (op2, 0);
7981 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7982 and the by-element operand as operand 0. */
7983 if (GET_CODE (op1) == NEG)
7984 op1 = XEXP (op1, 0);
7986 /* Catch vector-by-element operations. The by-element operand can
7987 either be (vec_duplicate (vec_select (x))) or just
7988 (vec_select (x)), depending on whether we are multiplying by
7989 a vector or a scalar.
7991 Canonicalization is not very good in these cases: FMA4 will put the
7992 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7993 if (GET_CODE (op0) == VEC_DUPLICATE)
7994 op0 = XEXP (op0, 0);
7995 else if (GET_CODE (op1) == VEC_DUPLICATE)
7996 op1 = XEXP (op1, 0);
7998 if (GET_CODE (op0) == VEC_SELECT)
7999 op0 = XEXP (op0, 0);
8000 else if (GET_CODE (op1) == VEC_SELECT)
8001 op1 = XEXP (op1, 0);
8003 /* If the remaining parameters are not registers,
8004 get the cost to put them into registers. */
8005 *cost += rtx_cost (op0, mode, FMA, 0, speed);
8006 *cost += rtx_cost (op1, mode, FMA, 1, speed);
8007 *cost += rtx_cost (op2, mode, FMA, 2, speed);
8008 return true;
8010 case FLOAT:
8011 case UNSIGNED_FLOAT:
8012 if (speed)
8013 *cost += extra_cost->fp[mode == DFmode].fromint;
8014 return false;
8016 case FLOAT_EXTEND:
8017 if (speed)
8019 if (VECTOR_MODE_P (mode))
8021 /* Vector widening conversion. */
8022 *cost += extra_cost->vect.alu;
8024 else
8025 *cost += extra_cost->fp[mode == DFmode].widen;
8027 return false;
8029 case FLOAT_TRUNCATE:
8030 if (speed)
8032 if (VECTOR_MODE_P (mode))
8034 /* Vector narrowing conversion. */
8035 *cost += extra_cost->vect.alu;
8037 else
8038 *cost += extra_cost->fp[mode == DFmode].narrow;
8040 return false;
8042 case FIX:
8043 case UNSIGNED_FIX:
8044 x = XEXP (x, 0);
8045 /* Strip the rounding part. They will all be implemented
8046 by the fcvt* family of instructions anyway. */
8047 if (GET_CODE (x) == UNSPEC)
8049 unsigned int uns_code = XINT (x, 1);
8051 if (uns_code == UNSPEC_FRINTA
8052 || uns_code == UNSPEC_FRINTM
8053 || uns_code == UNSPEC_FRINTN
8054 || uns_code == UNSPEC_FRINTP
8055 || uns_code == UNSPEC_FRINTZ)
8056 x = XVECEXP (x, 0, 0);
8059 if (speed)
8061 if (VECTOR_MODE_P (mode))
8062 *cost += extra_cost->vect.alu;
8063 else
8064 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8067 /* We can combine an fmul by a power of 2 followed by an fcvt into a single
8068 fixed-point fcvt. */
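	  /* For example, (int) (x * 16.0f) can usually become a single fcvtzs
	     with four fractional bits rather than an fmul followed by an
	     fcvt (an illustrative case for the scalar SF mode).  */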
8069 if (GET_CODE (x) == MULT
8070 && ((VECTOR_MODE_P (mode)
8071 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8072 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8074 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8075 0, speed);
8076 return true;
8079 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8080 return true;
8082 case ABS:
8083 if (VECTOR_MODE_P (mode))
8085 /* ABS (vector). */
8086 if (speed)
8087 *cost += extra_cost->vect.alu;
8089 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8091 op0 = XEXP (x, 0);
8093 /* FABD, which is analogous to FADD. */
8094 if (GET_CODE (op0) == MINUS)
8096 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8097 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8098 if (speed)
8099 *cost += extra_cost->fp[mode == DFmode].addsub;
8101 return true;
8103 /* Simple FABS is analogous to FNEG. */
8104 if (speed)
8105 *cost += extra_cost->fp[mode == DFmode].neg;
8107 else
8109 /* Integer ABS will either be split to
8110 two arithmetic instructions, or will be an ABS
8111 (scalar), which we don't model. */
8112 *cost = COSTS_N_INSNS (2);
8113 if (speed)
8114 *cost += 2 * extra_cost->alu.arith;
8116 return false;
8118 case SMAX:
8119 case SMIN:
8120 if (speed)
8122 if (VECTOR_MODE_P (mode))
8123 *cost += extra_cost->vect.alu;
8124 else
8126 /* FMAXNM/FMINNM/FMAX/FMIN.
8127 TODO: This may not be accurate for all implementations, but
8128 we do not model this in the cost tables. */
8129 *cost += extra_cost->fp[mode == DFmode].addsub;
8132 return false;
8134 case UNSPEC:
8135 /* The floating point round to integer frint* instructions. */
8136 if (aarch64_frint_unspec_p (XINT (x, 1)))
8138 if (speed)
8139 *cost += extra_cost->fp[mode == DFmode].roundint;
8141 return false;
8144 if (XINT (x, 1) == UNSPEC_RBIT)
8146 if (speed)
8147 *cost += extra_cost->alu.rev;
8149 return false;
8151 break;
8153 case TRUNCATE:
8155 /* Decompose <su>muldi3_highpart. */
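	/* In C terms this is roughly
	   (uint64_t) (((unsigned __int128) a * b) >> 64)
	   (or the signed equivalent), which maps onto a single UMULH/SMULH.  */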
8156 if (/* (truncate:DI */
8157 mode == DImode
8158 /* (lshiftrt:TI */
8159 && GET_MODE (XEXP (x, 0)) == TImode
8160 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8161 /* (mult:TI */
8162 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8163 /* (ANY_EXTEND:TI (reg:DI))
8164 (ANY_EXTEND:TI (reg:DI))) */
8165 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8166 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8167 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8168 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8169 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8170 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8171 /* (const_int 64) */
8172 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8173 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8175 /* UMULH/SMULH. */
8176 if (speed)
8177 *cost += extra_cost->mult[mode == DImode].extend;
8178 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8179 mode, MULT, 0, speed);
8180 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8181 mode, MULT, 1, speed);
8182 return true;
8185 /* Fall through. */
8186 default:
8187 break;
8190 if (dump_file
8191 && flag_aarch64_verbose_cost)
8192 fprintf (dump_file,
8193 "\nFailed to cost RTX. Assuming default cost.\n");
8195 return true;
8198 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8199 calculated for X. This cost is stored in *COST. Returns true
8200 if the total cost of X was calculated. */
8201 static bool
8202 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8203 int param, int *cost, bool speed)
8205 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8207 if (dump_file
8208 && flag_aarch64_verbose_cost)
8210 print_rtl_single (dump_file, x);
8211 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8212 speed ? "Hot" : "Cold",
8213 *cost, result ? "final" : "partial");
8216 return result;
8219 static int
8220 aarch64_register_move_cost (machine_mode mode,
8221 reg_class_t from_i, reg_class_t to_i)
8223 enum reg_class from = (enum reg_class) from_i;
8224 enum reg_class to = (enum reg_class) to_i;
8225 const struct cpu_regmove_cost *regmove_cost
8226 = aarch64_tune_params.regmove_cost;
8228 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8229 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8230 to = GENERAL_REGS;
8232 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8233 from = GENERAL_REGS;
8235 /* Moving between GPR and stack cost is the same as GP2GP. */
8236 if ((from == GENERAL_REGS && to == STACK_REG)
8237 || (to == GENERAL_REGS && from == STACK_REG))
8238 return regmove_cost->GP2GP;
8240 /* To/From the stack register, we move via the gprs. */
8241 if (to == STACK_REG || from == STACK_REG)
8242 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8243 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8245 if (GET_MODE_SIZE (mode) == 16)
8247 /* 128-bit operations on general registers require 2 instructions. */
8248 if (from == GENERAL_REGS && to == GENERAL_REGS)
8249 return regmove_cost->GP2GP * 2;
8250 else if (from == GENERAL_REGS)
8251 return regmove_cost->GP2FP * 2;
8252 else if (to == GENERAL_REGS)
8253 return regmove_cost->FP2GP * 2;
8255 /* When AdvSIMD instructions are disabled it is not possible to move
8256 a 128-bit value directly between Q registers. This is handled in
8257 secondary reload. A general register is used as a scratch to move
8258 the upper DI value and the lower DI value is moved directly,
8259 hence the cost is the sum of three moves. */
8260 if (! TARGET_SIMD)
8261 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8263 return regmove_cost->FP2FP;
8266 if (from == GENERAL_REGS && to == GENERAL_REGS)
8267 return regmove_cost->GP2GP;
8268 else if (from == GENERAL_REGS)
8269 return regmove_cost->GP2FP;
8270 else if (to == GENERAL_REGS)
8271 return regmove_cost->FP2GP;
8273 return regmove_cost->FP2FP;
8276 static int
8277 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8278 reg_class_t rclass ATTRIBUTE_UNUSED,
8279 bool in ATTRIBUTE_UNUSED)
8281 return aarch64_tune_params.memmov_cost;
8284 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8285 to optimize 1.0/sqrt. */
8287 static bool
8288 use_rsqrt_p (machine_mode mode)
8290 return (!flag_trapping_math
8291 && flag_unsafe_math_optimizations
8292 && ((aarch64_tune_params.approx_modes->recip_sqrt
8293 & AARCH64_APPROX_MODE (mode))
8294 || flag_mrecip_low_precision_sqrt));
8297 /* Function to decide when to use the approximate reciprocal square root
8298 builtin. */
8300 static tree
8301 aarch64_builtin_reciprocal (tree fndecl)
8303 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8305 if (!use_rsqrt_p (mode))
8306 return NULL_TREE;
8307 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8310 typedef rtx (*rsqrte_type) (rtx, rtx);
8312 /* Select reciprocal square root initial estimate insn depending on machine
8313 mode. */
8315 static rsqrte_type
8316 get_rsqrte_type (machine_mode mode)
8318 switch (mode)
8320 case E_DFmode: return gen_aarch64_rsqrtedf;
8321 case E_SFmode: return gen_aarch64_rsqrtesf;
8322 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8323 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8324 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8325 default: gcc_unreachable ();
8329 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8331 /* Select reciprocal square root series step insn depending on machine mode. */
8333 static rsqrts_type
8334 get_rsqrts_type (machine_mode mode)
8336 switch (mode)
8338 case E_DFmode: return gen_aarch64_rsqrtsdf;
8339 case E_SFmode: return gen_aarch64_rsqrtssf;
8340 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8341 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8342 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8343 default: gcc_unreachable ();
8347 /* Emit instruction sequence to compute either the approximate square root
8348 or its approximate reciprocal, depending on the flag RECP, and return
8349 whether the sequence was emitted or not. */
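/* The sequence below is a Newton-Raphson refinement of the FRSQRTE estimate:
   each FRSQRTS step computes (3 - d * x * x) / 2, so multiplying the current
   estimate by it moves x towards 1/sqrt (d), roughly doubling the number of
   correct bits per iteration.  */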
8351 bool
8352 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8354 machine_mode mode = GET_MODE (dst);
8356 if (GET_MODE_INNER (mode) == HFmode)
8358 gcc_assert (!recp);
8359 return false;
8362 if (!recp)
8364 if (!(flag_mlow_precision_sqrt
8365 || (aarch64_tune_params.approx_modes->sqrt
8366 & AARCH64_APPROX_MODE (mode))))
8367 return false;
8369 if (flag_finite_math_only
8370 || flag_trapping_math
8371 || !flag_unsafe_math_optimizations
8372 || optimize_function_for_size_p (cfun))
8373 return false;
8375 else
8376 /* Caller assumes we cannot fail. */
8377 gcc_assert (use_rsqrt_p (mode));
8379 machine_mode mmsk = mode_for_int_vector (mode).require ();
8380 rtx xmsk = gen_reg_rtx (mmsk);
8381 if (!recp)
8382 /* When calculating the approximate square root, compare the
8383 argument with 0.0 and create a mask. */
8384 emit_insn (gen_rtx_SET (xmsk,
8385 gen_rtx_NEG (mmsk,
8386 gen_rtx_EQ (mmsk, src,
8387 CONST0_RTX (mode)))));
8389 /* Estimate the approximate reciprocal square root. */
8390 rtx xdst = gen_reg_rtx (mode);
8391 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8393 /* Iterate over the series twice for SF and thrice for DF. */
8394 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8396 /* Optionally iterate over the series once less for faster performance
8397 while sacrificing the accuracy. */
8398 if ((recp && flag_mrecip_low_precision_sqrt)
8399 || (!recp && flag_mlow_precision_sqrt))
8400 iterations--;
8402 /* Iterate over the series to calculate the approximate reciprocal square
8403 root. */
8404 rtx x1 = gen_reg_rtx (mode);
8405 while (iterations--)
8407 rtx x2 = gen_reg_rtx (mode);
8408 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8410 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8412 if (iterations > 0)
8413 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8416 if (!recp)
8418 /* Qualify the approximate reciprocal square root when the argument is
8419 0.0 by squashing the intermediary result to 0.0. */
8420 rtx xtmp = gen_reg_rtx (mmsk);
8421 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8422 gen_rtx_SUBREG (mmsk, xdst, 0)));
8423 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8425 /* Calculate the approximate square root. */
8426 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8429 /* Finalize the approximation. */
8430 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8432 return true;
8435 typedef rtx (*recpe_type) (rtx, rtx);
8437 /* Select reciprocal initial estimate insn depending on machine mode. */
8439 static recpe_type
8440 get_recpe_type (machine_mode mode)
8442 switch (mode)
8444 case E_SFmode: return (gen_aarch64_frecpesf);
8445 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8446 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8447 case E_DFmode: return (gen_aarch64_frecpedf);
8448 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8449 default: gcc_unreachable ();
8453 typedef rtx (*recps_type) (rtx, rtx, rtx);
8455 /* Select reciprocal series step insn depending on machine mode. */
8457 static recps_type
8458 get_recps_type (machine_mode mode)
8460 switch (mode)
8462 case E_SFmode: return (gen_aarch64_frecpssf);
8463 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8464 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8465 case E_DFmode: return (gen_aarch64_frecpsdf);
8466 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8467 default: gcc_unreachable ();
8471 /* Emit the instruction sequence to compute the approximation for the division
8472 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
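/* The reciprocal below is a Newton-Raphson refinement of the FRECPE estimate:
   each FRECPS step computes (2 - d * x), so x * frecps (x, d) converges
   towards 1/d, and the quotient is then obtained as NUM * (1/DEN).  */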
8474 bool
8475 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8477 machine_mode mode = GET_MODE (quo);
8479 if (GET_MODE_INNER (mode) == HFmode)
8480 return false;
8482 bool use_approx_division_p = (flag_mlow_precision_div
8483 || (aarch64_tune_params.approx_modes->division
8484 & AARCH64_APPROX_MODE (mode)));
8486 if (!flag_finite_math_only
8487 || flag_trapping_math
8488 || !flag_unsafe_math_optimizations
8489 || optimize_function_for_size_p (cfun)
8490 || !use_approx_division_p)
8491 return false;
8493 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
8494 return false;
8496 /* Estimate the approximate reciprocal. */
8497 rtx xrcp = gen_reg_rtx (mode);
8498 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8500 /* Iterate over the series twice for SF and thrice for DF. */
8501 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8503 /* Optionally iterate over the series once less for faster performance,
8504 while sacrificing the accuracy. */
8505 if (flag_mlow_precision_div)
8506 iterations--;
8508 /* Iterate over the series to calculate the approximate reciprocal. */
8509 rtx xtmp = gen_reg_rtx (mode);
8510 while (iterations--)
8512 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8514 if (iterations > 0)
8515 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8518 if (num != CONST1_RTX (mode))
8520 /* As the approximate reciprocal of DEN is already calculated, only
8521 calculate the approximate division when NUM is not 1.0. */
8522 rtx xnum = force_reg (mode, num);
8523 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8526 /* Finalize the approximation. */
8527 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8528 return true;
8531 /* Return the number of instructions that can be issued per cycle. */
8532 static int
8533 aarch64_sched_issue_rate (void)
8535 return aarch64_tune_params.issue_rate;
8538 static int
8539 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8541 int issue_rate = aarch64_sched_issue_rate ();
8543 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8547 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8548 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8549 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8551 static int
8552 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8553 int ready_index)
8555 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8559 /* Vectorizer cost model target hooks. */
8561 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8562 static int
8563 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8564 tree vectype,
8565 int misalign ATTRIBUTE_UNUSED)
8567 unsigned elements;
8568 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8569 bool fp = false;
8571 if (vectype != NULL)
8572 fp = FLOAT_TYPE_P (vectype);
8574 switch (type_of_cost)
8576 case scalar_stmt:
8577 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8579 case scalar_load:
8580 return costs->scalar_load_cost;
8582 case scalar_store:
8583 return costs->scalar_store_cost;
8585 case vector_stmt:
8586 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8588 case vector_load:
8589 return costs->vec_align_load_cost;
8591 case vector_store:
8592 return costs->vec_store_cost;
8594 case vec_to_scalar:
8595 return costs->vec_to_scalar_cost;
8597 case scalar_to_vec:
8598 return costs->scalar_to_vec_cost;
8600 case unaligned_load:
8601 case vector_gather_load:
8602 return costs->vec_unalign_load_cost;
8604 case unaligned_store:
8605 case vector_scatter_store:
8606 return costs->vec_unalign_store_cost;
8608 case cond_branch_taken:
8609 return costs->cond_taken_branch_cost;
8611 case cond_branch_not_taken:
8612 return costs->cond_not_taken_branch_cost;
8614 case vec_perm:
8615 return costs->vec_permute_cost;
8617 case vec_promote_demote:
8618 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8620 case vec_construct:
8621 elements = TYPE_VECTOR_SUBPARTS (vectype);
8622 return elements / 2 + 1;
8624 default:
8625 gcc_unreachable ();
8629 /* Implement targetm.vectorize.add_stmt_cost. */
8630 static unsigned
8631 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8632 struct _stmt_vec_info *stmt_info, int misalign,
8633 enum vect_cost_model_location where)
8635 unsigned *cost = (unsigned *) data;
8636 unsigned retval = 0;
8638 if (flag_vect_cost_model)
8640 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8641 int stmt_cost =
8642 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8644 /* Statements in an inner loop relative to the loop being
8645 vectorized are weighted more heavily. The value here is
8646 arbitrary and could potentially be improved with analysis. */
8647 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8648 count *= 50; /* FIXME */
8650 retval = (unsigned) (count * stmt_cost);
8651 cost[where] += retval;
8654 return retval;
8657 static void initialize_aarch64_code_model (struct gcc_options *);
8659 /* Parse the TO_PARSE string and put the architecture struct that it
8660 selects into RES and the architectural features into ISA_FLAGS.
8661 Return an aarch64_parse_opt_result describing the parse result.
8662 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8664 static enum aarch64_parse_opt_result
8665 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8666 unsigned long *isa_flags)
8668 char *ext;
8669 const struct processor *arch;
8670 char *str = (char *) alloca (strlen (to_parse) + 1);
8671 size_t len;
8673 strcpy (str, to_parse);
8675 ext = strchr (str, '+');
8677 if (ext != NULL)
8678 len = ext - str;
8679 else
8680 len = strlen (str);
8682 if (len == 0)
8683 return AARCH64_PARSE_MISSING_ARG;
8686 /* Loop through the list of supported ARCHes to find a match. */
8687 for (arch = all_architectures; arch->name != NULL; arch++)
8689 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8691 unsigned long isa_temp = arch->flags;
8693 if (ext != NULL)
8695 /* TO_PARSE string contains at least one extension. */
8696 enum aarch64_parse_opt_result ext_res
8697 = aarch64_parse_extension (ext, &isa_temp);
8699 if (ext_res != AARCH64_PARSE_OK)
8700 return ext_res;
8702 /* Extension parsing was successful. Confirm the result
8703 arch and ISA flags. */
8704 *res = arch;
8705 *isa_flags = isa_temp;
8706 return AARCH64_PARSE_OK;
8710 /* ARCH name not found in list. */
8711 return AARCH64_PARSE_INVALID_ARG;
8714 /* Parse the TO_PARSE string and put the result tuning in RES and the
8715 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8716 describing the parse result. If there is an error parsing, RES and
8717 ISA_FLAGS are left unchanged. */
8719 static enum aarch64_parse_opt_result
8720 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8721 unsigned long *isa_flags)
8723 char *ext;
8724 const struct processor *cpu;
8725 char *str = (char *) alloca (strlen (to_parse) + 1);
8726 size_t len;
8728 strcpy (str, to_parse);
8730 ext = strchr (str, '+');
8732 if (ext != NULL)
8733 len = ext - str;
8734 else
8735 len = strlen (str);
8737 if (len == 0)
8738 return AARCH64_PARSE_MISSING_ARG;
8741 /* Loop through the list of supported CPUs to find a match. */
8742 for (cpu = all_cores; cpu->name != NULL; cpu++)
8744 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8746 unsigned long isa_temp = cpu->flags;
8749 if (ext != NULL)
8751 /* TO_PARSE string contains at least one extension. */
8752 enum aarch64_parse_opt_result ext_res
8753 = aarch64_parse_extension (ext, &isa_temp);
8755 if (ext_res != AARCH64_PARSE_OK)
8756 return ext_res;
8758 /* Extension parsing was successfull. Confirm the result
8759 cpu and ISA flags. */
8760 *res = cpu;
8761 *isa_flags = isa_temp;
8762 return AARCH64_PARSE_OK;
8766 /* CPU name not found in list. */
8767 return AARCH64_PARSE_INVALID_ARG;
8770 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8771 Return an aarch64_parse_opt_result describing the parse result.
8772 If the parsing fails the RES does not change. */
8774 static enum aarch64_parse_opt_result
8775 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8777 const struct processor *cpu;
8778 char *str = (char *) alloca (strlen (to_parse) + 1);
8780 strcpy (str, to_parse);
8782 /* Loop through the list of supported CPUs to find a match. */
8783 for (cpu = all_cores; cpu->name != NULL; cpu++)
8785 if (strcmp (cpu->name, str) == 0)
8787 *res = cpu;
8788 return AARCH64_PARSE_OK;
8792 /* CPU name not found in list. */
8793 return AARCH64_PARSE_INVALID_ARG;
8796 /* Parse TOKEN, which has length LENGTH to see if it is an option
8797 described in FLAG. If it is, return the index bit for that fusion type.
8798 If not, error (printing OPTION_NAME) and return zero. */
8800 static unsigned int
8801 aarch64_parse_one_option_token (const char *token,
8802 size_t length,
8803 const struct aarch64_flag_desc *flag,
8804 const char *option_name)
8806 for (; flag->name != NULL; flag++)
8808 if (length == strlen (flag->name)
8809 && !strncmp (flag->name, token, length))
8810 return flag->flag;
8813 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8814 return 0;
8817 /* Parse OPTION which is a comma-separated list of flags to enable.
8818 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8819 default state we inherit from the CPU tuning structures. OPTION_NAME
8820 gives the top-level option we are parsing in the -moverride string,
8821 for use in error messages. */
8823 static unsigned int
8824 aarch64_parse_boolean_options (const char *option,
8825 const struct aarch64_flag_desc *flags,
8826 unsigned int initial_state,
8827 const char *option_name)
8829 const char separator = '.';
8830 const char* specs = option;
8831 const char* ntoken = option;
8832 unsigned int found_flags = initial_state;
8834 while ((ntoken = strchr (specs, separator)))
8836 size_t token_length = ntoken - specs;
8837 unsigned token_ops = aarch64_parse_one_option_token (specs,
8838 token_length,
8839 flags,
8840 option_name);
8841 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8842 in the token stream, reset the supported operations. So:
8844 adrp+add.cmp+branch.none.adrp+add
8846 would have the result of turning on only adrp+add fusion. */
8847 if (!token_ops)
8848 found_flags = 0;
8850 found_flags |= token_ops;
8851 specs = ++ntoken;
8854 /* We ended with a comma, print something. */
8855 if (!(*specs))
8857 error ("%s string ill-formed\n", option_name);
8858 return 0;
8861 /* We still have one more token to parse. */
8862 size_t token_length = strlen (specs);
8863 unsigned token_ops = aarch64_parse_one_option_token (specs,
8864 token_length,
8865 flags,
8866 option_name);
8867 if (!token_ops)
8868 found_flags = 0;
8870 found_flags |= token_ops;
8871 return found_flags;
8874 /* Support for overriding instruction fusion. */
8876 static void
8877 aarch64_parse_fuse_string (const char *fuse_string,
8878 struct tune_params *tune)
8880 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8881 aarch64_fusible_pairs,
8882 tune->fusible_ops,
8883 "fuse=");
8886 /* Support for overriding other tuning flags. */
8888 static void
8889 aarch64_parse_tune_string (const char *tune_string,
8890 struct tune_params *tune)
8892 tune->extra_tuning_flags
8893 = aarch64_parse_boolean_options (tune_string,
8894 aarch64_tuning_flags,
8895 tune->extra_tuning_flags,
8896 "tune=");
8899 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8900 we understand. If it is, extract the option string and handoff to
8901 the appropriate function. */
8903 void
8904 aarch64_parse_one_override_token (const char* token,
8905 size_t length,
8906 struct tune_params *tune)
8908 const struct aarch64_tuning_override_function *fn
8909 = aarch64_tuning_override_functions;
8911 const char *option_part = strchr (token, '=');
8912 if (!option_part)
8914 error ("tuning string missing in option (%s)", token);
8915 return;
8918 /* Get the length of the option name. */
8919 length = option_part - token;
8920 /* Skip the '=' to get to the option string. */
8921 option_part++;
8923 for (; fn->name != NULL; fn++)
8925 if (!strncmp (fn->name, token, length))
8927 fn->parse_override (option_part, tune);
8928 return;
8932 error ("unknown tuning option (%s)",token);
8933 return;
8936 /* A checking mechanism for the implementation of the tls size. */
8938 static void
8939 initialize_aarch64_tls_size (struct gcc_options *opts)
8941 if (aarch64_tls_size == 0)
8942 aarch64_tls_size = 24;
8944 switch (opts->x_aarch64_cmodel_var)
8946 case AARCH64_CMODEL_TINY:
8947 /* Both the default and maximum TLS size allowed under tiny is 1M which
8948 needs two instructions to address, so we clamp the size to 24. */
8949 if (aarch64_tls_size > 24)
8950 aarch64_tls_size = 24;
8951 break;
8952 case AARCH64_CMODEL_SMALL:
8953 /* The maximum TLS size allowed under small is 4G. */
8954 if (aarch64_tls_size > 32)
8955 aarch64_tls_size = 32;
8956 break;
8957 case AARCH64_CMODEL_LARGE:
8958 /* The maximum TLS size allowed under large is 16E.
8959 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8960 if (aarch64_tls_size > 48)
8961 aarch64_tls_size = 48;
8962 break;
8963 default:
8964 gcc_unreachable ();
8967 return;
8970 /* Parse STRING looking for options in the format:
8971 string :: option:string
8972 option :: name=substring
8973 name :: {a-z}
8974 substring :: defined by option. */
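/* For example, "-moverride=fuse=adrp+add.cmp+branch" enables only those two
   fusion pairs; each name=value pair is dispatched through the handlers in
   aarch64_tuning_override_functions.  */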
8976 static void
8977 aarch64_parse_override_string (const char* input_string,
8978 struct tune_params* tune)
8980 const char separator = ':';
8981 size_t string_length = strlen (input_string) + 1;
8982 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8983 char *string = string_root;
8984 strncpy (string, input_string, string_length);
8985 string[string_length - 1] = '\0';
8987 char* ntoken = string;
8989 while ((ntoken = strchr (string, separator)))
8991 size_t token_length = ntoken - string;
8992 /* Make this substring look like a string. */
8993 *ntoken = '\0';
8994 aarch64_parse_one_override_token (string, token_length, tune);
8995 string = ++ntoken;
8998 /* One last option to parse. */
8999 aarch64_parse_one_override_token (string, strlen (string), tune);
9000 free (string_root);
9004 static void
9005 aarch64_override_options_after_change_1 (struct gcc_options *opts)
9007 /* PR 70044: We have to be careful about being called multiple times for the
9008 same function. This means all changes should be repeatable. */
9010 /* If the frame pointer is enabled, set it to a special value that behaves
9011 similar to frame pointer omission. If we don't do this all leaf functions
9012 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
9013 If flag_omit_frame_pointer has this special value, we must force the
9014 frame pointer if not in a leaf function. We also need to force it in a
9015 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
9016 if (opts->x_flag_omit_frame_pointer == 0)
9017 opts->x_flag_omit_frame_pointer = 2;
9019 /* If not optimizing for size, set the default
9020 alignment to what the target wants. */
9021 if (!opts->x_optimize_size)
9023 if (opts->x_align_loops <= 0)
9024 opts->x_align_loops = aarch64_tune_params.loop_align;
9025 if (opts->x_align_jumps <= 0)
9026 opts->x_align_jumps = aarch64_tune_params.jump_align;
9027 if (opts->x_align_functions <= 0)
9028 opts->x_align_functions = aarch64_tune_params.function_align;
9031 /* We default to no pc-relative literal loads. */
9033 aarch64_pcrelative_literal_loads = false;
9035 /* If -mpc-relative-literal-loads is set on the command line, this
9036 implies that the user asked for PC relative literal loads. */
9037 if (opts->x_pcrelative_literal_loads == 1)
9038 aarch64_pcrelative_literal_loads = true;
9040 /* In the tiny memory model it makes no sense to disallow PC relative
9041 literal pool loads. */
9042 if (aarch64_cmodel == AARCH64_CMODEL_TINY
9043 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9044 aarch64_pcrelative_literal_loads = true;
9046 /* When enabling the lower precision Newton series for the square root, also
9047 enable it for the reciprocal square root, since the latter is an
9048 intermediary step for the former. */
9049 if (flag_mlow_precision_sqrt)
9050 flag_mrecip_low_precision_sqrt = true;
9053 /* 'Unpack' up the internal tuning structs and update the options
9054 in OPTS. The caller must have set up selected_tune and selected_arch
9055 as all the other target-specific codegen decisions are
9056 derived from them. */
9058 void
9059 aarch64_override_options_internal (struct gcc_options *opts)
9061 aarch64_tune_flags = selected_tune->flags;
9062 aarch64_tune = selected_tune->sched_core;
9063 /* Make a copy of the tuning parameters attached to the core, which
9064 we may later overwrite. */
9065 aarch64_tune_params = *(selected_tune->tune);
9066 aarch64_architecture_version = selected_arch->architecture_version;
9068 if (opts->x_aarch64_override_tune_string)
9069 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9070 &aarch64_tune_params);
9072 /* This target defaults to strict volatile bitfields. */
9073 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9074 opts->x_flag_strict_volatile_bitfields = 1;
9076 initialize_aarch64_code_model (opts);
9077 initialize_aarch64_tls_size (opts);
9079 int queue_depth = 0;
9080 switch (aarch64_tune_params.autoprefetcher_model)
9082 case tune_params::AUTOPREFETCHER_OFF:
9083 queue_depth = -1;
9084 break;
9085 case tune_params::AUTOPREFETCHER_WEAK:
9086 queue_depth = 0;
9087 break;
9088 case tune_params::AUTOPREFETCHER_STRONG:
9089 queue_depth = max_insn_queue_index + 1;
9090 break;
9091 default:
9092 gcc_unreachable ();
9095 /* We don't mind passing in global_options_set here as we don't use
9096 the *options_set structs anyway. */
9097 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9098 queue_depth,
9099 opts->x_param_values,
9100 global_options_set.x_param_values);
9102 /* Set up parameters to be used in prefetching algorithm. Do not
9103 override the defaults unless we are tuning for a core we have
9104 researched values for. */
9105 if (aarch64_tune_params.prefetch->num_slots > 0)
9106 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9107 aarch64_tune_params.prefetch->num_slots,
9108 opts->x_param_values,
9109 global_options_set.x_param_values);
9110 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9111 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9112 aarch64_tune_params.prefetch->l1_cache_size,
9113 opts->x_param_values,
9114 global_options_set.x_param_values);
9115 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9116 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9117 aarch64_tune_params.prefetch->l1_cache_line_size,
9118 opts->x_param_values,
9119 global_options_set.x_param_values);
9120 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9121 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9122 aarch64_tune_params.prefetch->l2_cache_size,
9123 opts->x_param_values,
9124 global_options_set.x_param_values);
9126 /* Use the alternative scheduling-pressure algorithm by default. */
9127 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
9128 opts->x_param_values,
9129 global_options_set.x_param_values);
9131 /* Enable sw prefetching at specified optimization level for
9132 CPUS that have prefetch. Lower optimization level threshold by 1
9133 when profiling is enabled. */
9134 if (opts->x_flag_prefetch_loop_arrays < 0
9135 && !opts->x_optimize_size
9136 && aarch64_tune_params.prefetch->default_opt_level >= 0
9137 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9138 opts->x_flag_prefetch_loop_arrays = 1;
9140 aarch64_override_options_after_change_1 (opts);
9143 /* Print a hint with a suggestion for a core or architecture name that
9144 most closely resembles what the user passed in STR. ARCH is true if
9145 the user is asking for an architecture name. ARCH is false if the user
9146 is asking for a core name. */
9148 static void
9149 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9151 auto_vec<const char *> candidates;
9152 const struct processor *entry = arch ? all_architectures : all_cores;
9153 for (; entry->name != NULL; entry++)
9154 candidates.safe_push (entry->name);
9155 char *s;
9156 const char *hint = candidates_list_and_hint (str, s, candidates);
9157 if (hint)
9158 inform (input_location, "valid arguments are: %s;"
9159 " did you mean %qs?", s, hint);
9160 XDELETEVEC (s);
9163 /* Print a hint with a suggestion for a core name that most closely resembles
9164 what the user passed in STR. */
9166 inline static void
9167 aarch64_print_hint_for_core (const char *str)
9169 aarch64_print_hint_for_core_or_arch (str, false);
9172 /* Print a hint with a suggestion for an architecture name that most closely
9173 resembles what the user passed in STR. */
9175 inline static void
9176 aarch64_print_hint_for_arch (const char *str)
9178 aarch64_print_hint_for_core_or_arch (str, true);
9181 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9182 specified in STR and throw errors if appropriate. Put the results if
9183 they are valid in RES and ISA_FLAGS. Return whether the option is
9184 valid. */
9186 static bool
9187 aarch64_validate_mcpu (const char *str, const struct processor **res,
9188 unsigned long *isa_flags)
9190 enum aarch64_parse_opt_result parse_res
9191 = aarch64_parse_cpu (str, res, isa_flags);
9193 if (parse_res == AARCH64_PARSE_OK)
9194 return true;
9196 switch (parse_res)
9198 case AARCH64_PARSE_MISSING_ARG:
9199 error ("missing cpu name in %<-mcpu=%s%>", str);
9200 break;
9201 case AARCH64_PARSE_INVALID_ARG:
9202 error ("unknown value %qs for -mcpu", str);
9203 aarch64_print_hint_for_core (str);
9204 break;
9205 case AARCH64_PARSE_INVALID_FEATURE:
9206 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9207 break;
9208 default:
9209 gcc_unreachable ();
9212 return false;
9215 /* Validate a command-line -march option. Parse the arch and extensions
9216 (if any) specified in STR and throw errors if appropriate. Put the
9217 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9218 option is valid. */
9220 static bool
9221 aarch64_validate_march (const char *str, const struct processor **res,
9222 unsigned long *isa_flags)
9224 enum aarch64_parse_opt_result parse_res
9225 = aarch64_parse_arch (str, res, isa_flags);
9227 if (parse_res == AARCH64_PARSE_OK)
9228 return true;
9230 switch (parse_res)
9232 case AARCH64_PARSE_MISSING_ARG:
9233 error ("missing arch name in %<-march=%s%>", str);
9234 break;
9235 case AARCH64_PARSE_INVALID_ARG:
9236 error ("unknown value %qs for -march", str);
9237 aarch64_print_hint_for_arch (str);
9238 break;
9239 case AARCH64_PARSE_INVALID_FEATURE:
9240 error ("invalid feature modifier in %<-march=%s%>", str);
9241 break;
9242 default:
9243 gcc_unreachable ();
9246 return false;
9249 /* Validate a command-line -mtune option. Parse the cpu
9250 specified in STR and throw errors if appropriate. Put the
9251 result, if it is valid, in RES. Return whether the option is
9252 valid. */
9254 static bool
9255 aarch64_validate_mtune (const char *str, const struct processor **res)
9257 enum aarch64_parse_opt_result parse_res
9258 = aarch64_parse_tune (str, res);
9260 if (parse_res == AARCH64_PARSE_OK)
9261 return true;
9263 switch (parse_res)
9265 case AARCH64_PARSE_MISSING_ARG:
9266 error ("missing cpu name in %<-mtune=%s%>", str);
9267 break;
9268 case AARCH64_PARSE_INVALID_ARG:
9269 error ("unknown value %qs for -mtune", str);
9270 aarch64_print_hint_for_core (str);
9271 break;
9272 default:
9273 gcc_unreachable ();
9275 return false;
9278 /* Return the CPU corresponding to the enum CPU.
9279 If it doesn't specify a cpu, return the default. */
9281 static const struct processor *
9282 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9284 if (cpu != aarch64_none)
9285 return &all_cores[cpu];
9287 /* The & 0x3f is to extract the bottom 6 bits that encode the
9288 default cpu as selected by the --with-cpu GCC configure option
9289 in config.gcc.
9290 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9291 flags mechanism should be reworked to make it more sane. */
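  /* The remaining upper bits of TARGET_CPU_DEFAULT hold the default ISA
     flags; they are recovered with ">> 6" in aarch64_override_options.  */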
9292 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9295 /* Return the architecture corresponding to the enum ARCH.
9296 If it doesn't specify a valid architecture, return the default. */
9298 static const struct processor *
9299 aarch64_get_arch (enum aarch64_arch arch)
9301 if (arch != aarch64_no_arch)
9302 return &all_architectures[arch];
9304 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9306 return &all_architectures[cpu->arch];
9309 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9310 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9311 tuning structs. In particular it must set selected_tune and
9312 aarch64_isa_flags that define the available ISA features and tuning
9313 decisions. It must also set selected_arch as this will be used to
9314 output the .arch asm tags for each function. */
9316 static void
9317 aarch64_override_options (void)
9319 unsigned long cpu_isa = 0;
9320 unsigned long arch_isa = 0;
9321 aarch64_isa_flags = 0;
9323 bool valid_cpu = true;
9324 bool valid_tune = true;
9325 bool valid_arch = true;
9327 selected_cpu = NULL;
9328 selected_arch = NULL;
9329 selected_tune = NULL;
9331 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9332 If either of -march or -mtune is given, they override their
9333 respective component of -mcpu. */
9334 if (aarch64_cpu_string)
9335 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9336 &cpu_isa);
9338 if (aarch64_arch_string)
9339 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9340 &arch_isa);
9342 if (aarch64_tune_string)
9343 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9345 /* If the user did not specify a processor, choose the default
9346 one for them. This will be the CPU set during configuration using
9347 --with-cpu, otherwise it is "generic". */
9348 if (!selected_cpu)
9350 if (selected_arch)
9352 selected_cpu = &all_cores[selected_arch->ident];
9353 aarch64_isa_flags = arch_isa;
9354 explicit_arch = selected_arch->arch;
9356 else
9358 /* Get default configure-time CPU. */
9359 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9360 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9363 if (selected_tune)
9364 explicit_tune_core = selected_tune->ident;
9366 /* If both -mcpu and -march are specified check that they are architecturally
9367 compatible, warn if they're not and prefer the -march ISA flags. */
9368 else if (selected_arch)
9370 if (selected_arch->arch != selected_cpu->arch)
9372 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9373 all_architectures[selected_cpu->arch].name,
9374 selected_arch->name);
9376 aarch64_isa_flags = arch_isa;
9377 explicit_arch = selected_arch->arch;
9378 explicit_tune_core = selected_tune ? selected_tune->ident
9379 : selected_cpu->ident;
9381 else
9383 /* -mcpu but no -march. */
9384 aarch64_isa_flags = cpu_isa;
9385 explicit_tune_core = selected_tune ? selected_tune->ident
9386 : selected_cpu->ident;
9387 gcc_assert (selected_cpu);
9388 selected_arch = &all_architectures[selected_cpu->arch];
9389 explicit_arch = selected_arch->arch;
9392 /* Set the arch as well as we will need it when outputing
9393 the .arch directive in assembly. */
9394 if (!selected_arch)
9396 gcc_assert (selected_cpu);
9397 selected_arch = &all_architectures[selected_cpu->arch];
9400 if (!selected_tune)
9401 selected_tune = selected_cpu;
9403 #ifndef HAVE_AS_MABI_OPTION
9404 /* The compiler may have been configured with 2.23.* binutils, which does
9405 not have support for ILP32. */
9406 if (TARGET_ILP32)
9407 error ("Assembler does not support -mabi=ilp32");
9408 #endif
9410 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9411 sorry ("Return address signing is only supported for -mabi=lp64");
9413 /* Make sure we properly set up the explicit options. */
9414 if ((aarch64_cpu_string && valid_cpu)
9415 || (aarch64_tune_string && valid_tune))
9416 gcc_assert (explicit_tune_core != aarch64_none);
9418 if ((aarch64_cpu_string && valid_cpu)
9419 || (aarch64_arch_string && valid_arch))
9420 gcc_assert (explicit_arch != aarch64_no_arch);
9422 aarch64_override_options_internal (&global_options);
9424 /* Save these options as the default ones in case we push and pop them later
9425 while processing functions with potential target attributes. */
9426 target_option_default_node = target_option_current_node
9427 = build_target_option_node (&global_options);
9430 /* Implement targetm.override_options_after_change. */
9432 static void
9433 aarch64_override_options_after_change (void)
9435 aarch64_override_options_after_change_1 (&global_options);
9438 static struct machine_function *
9439 aarch64_init_machine_status (void)
9441 struct machine_function *machine;
9442 machine = ggc_cleared_alloc<machine_function> ();
9443 return machine;
9446 void
9447 aarch64_init_expanders (void)
9449 init_machine_status = aarch64_init_machine_status;
9452 /* A checking mechanism for the implementation of the various code models. */
9453 static void
9454 initialize_aarch64_code_model (struct gcc_options *opts)
9456 if (opts->x_flag_pic)
9458 switch (opts->x_aarch64_cmodel_var)
9460 case AARCH64_CMODEL_TINY:
9461 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9462 break;
9463 case AARCH64_CMODEL_SMALL:
9464 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9465 aarch64_cmodel = (flag_pic == 2
9466 ? AARCH64_CMODEL_SMALL_PIC
9467 : AARCH64_CMODEL_SMALL_SPIC);
9468 #else
9469 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9470 #endif
9471 break;
9472 case AARCH64_CMODEL_LARGE:
9473 sorry ("code model %qs with -f%s", "large",
9474 opts->x_flag_pic > 1 ? "PIC" : "pic");
9475 break;
9476 default:
9477 gcc_unreachable ();
9480 else
9481 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9484 /* Implement TARGET_OPTION_SAVE. */
9486 static void
9487 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9489 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9492 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9493 using the information saved in PTR. */
9495 static void
9496 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9498 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9499 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9500 opts->x_explicit_arch = ptr->x_explicit_arch;
9501 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9502 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9504 aarch64_override_options_internal (opts);
9507 /* Implement TARGET_OPTION_PRINT. */
9509 static void
9510 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9512 const struct processor *cpu
9513 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9514 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9515 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9516 std::string extension
9517 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9519 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9520 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9521 arch->name, extension.c_str ());
9524 static GTY(()) tree aarch64_previous_fndecl;
9526 void
9527 aarch64_reset_previous_fndecl (void)
9529 aarch64_previous_fndecl = NULL;
9532 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9533 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9534 make sure optab availability predicates are recomputed when necessary. */
9536 void
9537 aarch64_save_restore_target_globals (tree new_tree)
9539 if (TREE_TARGET_GLOBALS (new_tree))
9540 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9541 else if (new_tree == target_option_default_node)
9542 restore_target_globals (&default_target_globals);
9543 else
9544 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9547 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9548 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9549 of the function, if such exists. This function may be called multiple
9550 times on a single function so use aarch64_previous_fndecl to avoid
9551 setting up identical state. */
9553 static void
9554 aarch64_set_current_function (tree fndecl)
9556 if (!fndecl || fndecl == aarch64_previous_fndecl)
9557 return;
9559 tree old_tree = (aarch64_previous_fndecl
9560 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9561 : NULL_TREE);
9563 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9565 /* If current function has no attributes but the previous one did,
9566 use the default node. */
9567 if (!new_tree && old_tree)
9568 new_tree = target_option_default_node;
9570 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9571 the default have been handled by aarch64_save_restore_target_globals from
9572 aarch64_pragma_target_parse. */
9573 if (old_tree == new_tree)
9574 return;
9576 aarch64_previous_fndecl = fndecl;
9578 /* First set the target options. */
9579 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9581 aarch64_save_restore_target_globals (new_tree);
9584 /* Enum describing the various ways we can handle attributes.
9585 In many cases we can reuse the generic option handling machinery. */
9587 enum aarch64_attr_opt_type
9589 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9590 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9591 aarch64_attr_enum, /* Attribute sets an enum variable. */
9592 aarch64_attr_custom /* Attribute requires a custom handling function. */
9595 /* All the information needed to handle a target attribute.
9596 NAME is the name of the attribute.
9597 ATTR_TYPE specifies the type of behavior of the attribute as described
9598 in the definition of enum aarch64_attr_opt_type.
9599 ALLOW_NEG is true if the attribute supports a "no-" form.
9600 HANDLER is the function that takes the attribute string as an argument
9601 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
9602 OPT_NUM is the enum specifying the option that the attribute modifies.
9603 This is needed for attributes that mirror the behavior of a command-line
9604 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9605 aarch64_attr_enum. */
9607 struct aarch64_attribute_info
9609 const char *name;
9610 enum aarch64_attr_opt_type attr_type;
9611 bool allow_neg;
9612 bool (*handler) (const char *);
9613 enum opt_code opt_num;
9616 /* Handle the ARCH_STR argument to the arch= target attribute. */
9618 static bool
9619 aarch64_handle_attr_arch (const char *str)
9621 const struct processor *tmp_arch = NULL;
9622 enum aarch64_parse_opt_result parse_res
9623 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9625 if (parse_res == AARCH64_PARSE_OK)
9627 gcc_assert (tmp_arch);
9628 selected_arch = tmp_arch;
9629 explicit_arch = selected_arch->arch;
9630 return true;
9633 switch (parse_res)
9635 case AARCH64_PARSE_MISSING_ARG:
9636 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
9637 break;
9638 case AARCH64_PARSE_INVALID_ARG:
9639 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
9640 aarch64_print_hint_for_arch (str);
9641 break;
9642 case AARCH64_PARSE_INVALID_FEATURE:
9643 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9644 break;
9645 default:
9646 gcc_unreachable ();
9649 return false;
9652 /* Handle the argument CPU_STR to the cpu= target attribute. */
9654 static bool
9655 aarch64_handle_attr_cpu (const char *str)
9657 const struct processor *tmp_cpu = NULL;
9658 enum aarch64_parse_opt_result parse_res
9659 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9661 if (parse_res == AARCH64_PARSE_OK)
9663 gcc_assert (tmp_cpu);
9664 selected_tune = tmp_cpu;
9665 explicit_tune_core = selected_tune->ident;
9667 selected_arch = &all_architectures[tmp_cpu->arch];
9668 explicit_arch = selected_arch->arch;
9669 return true;
9672 switch (parse_res)
9674 case AARCH64_PARSE_MISSING_ARG:
9675 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
9676 break;
9677 case AARCH64_PARSE_INVALID_ARG:
9678 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
9679 aarch64_print_hint_for_core (str);
9680 break;
9681 case AARCH64_PARSE_INVALID_FEATURE:
9682 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9683 break;
9684 default:
9685 gcc_unreachable ();
9688 return false;
9691 /* Handle the argument STR to the tune= target attribute. */
9693 static bool
9694 aarch64_handle_attr_tune (const char *str)
9696 const struct processor *tmp_tune = NULL;
9697 enum aarch64_parse_opt_result parse_res
9698 = aarch64_parse_tune (str, &tmp_tune);
9700 if (parse_res == AARCH64_PARSE_OK)
9702 gcc_assert (tmp_tune);
9703 selected_tune = tmp_tune;
9704 explicit_tune_core = selected_tune->ident;
9705 return true;
9708 switch (parse_res)
9710 case AARCH64_PARSE_INVALID_ARG:
9711 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
9712 aarch64_print_hint_for_core (str);
9713 break;
9714 default:
9715 gcc_unreachable ();
9718 return false;
9721 /* Parse an architecture extensions target attribute string specified in STR.
9722 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9723 if successful. Update aarch64_isa_flags to reflect the ISA features
9724 modified. */
9726 static bool
9727 aarch64_handle_attr_isa_flags (char *str)
9729 enum aarch64_parse_opt_result parse_res;
9730 unsigned long isa_flags = aarch64_isa_flags;
9732 /* We allow "+nothing" in the beginning to clear out all architectural
9733 features if the user wants to handpick specific features. */
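  /* For example, "+nothing+crc" clears the feature set and then enables only
     the extensions explicitly named after it (here, CRC).  */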
9734 if (strncmp ("+nothing", str, 8) == 0)
9736 isa_flags = 0;
9737 str += 8;
9740 parse_res = aarch64_parse_extension (str, &isa_flags);
9742 if (parse_res == AARCH64_PARSE_OK)
9744 aarch64_isa_flags = isa_flags;
9745 return true;
9748 switch (parse_res)
9750 case AARCH64_PARSE_MISSING_ARG:
9751 error ("missing value in %<target()%> pragma or attribute");
9752 break;
9754 case AARCH64_PARSE_INVALID_FEATURE:
9755 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
9756 break;
9758 default:
9759 gcc_unreachable ();
9762 return false;
9765 /* The target attributes that we support. On top of these we also support just
9766 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9767 handled explicitly in aarch64_process_one_target_attr. */
9769 static const struct aarch64_attribute_info aarch64_attributes[] =
9771 { "general-regs-only", aarch64_attr_mask, false, NULL,
9772 OPT_mgeneral_regs_only },
9773 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9774 OPT_mfix_cortex_a53_835769 },
9775 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9776 OPT_mfix_cortex_a53_843419 },
9777 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9778 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9779 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9780 OPT_momit_leaf_frame_pointer },
9781 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9782 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9783 OPT_march_ },
9784 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9785 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9786 OPT_mtune_ },
9787 { "sign-return-address", aarch64_attr_enum, false, NULL,
9788 OPT_msign_return_address_ },
9789 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9792 /* Parse ARG_STR which contains the definition of one target attribute.
9793 Show appropriate errors if any or return true if the attribute is valid. */
9795 static bool
9796 aarch64_process_one_target_attr (char *arg_str)
9798 bool invert = false;
9800 size_t len = strlen (arg_str);
9802 if (len == 0)
9804 error ("malformed %<target()%> pragma or attribute");
9805 return false;
9808 char *str_to_check = (char *) alloca (len + 1);
9809 strcpy (str_to_check, arg_str);
9811 /* Skip leading whitespace. */
9812 while (*str_to_check == ' ' || *str_to_check == '\t')
9813 str_to_check++;
9815 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9816 It is easier to detect and handle it explicitly here rather than going
9817 through the machinery for the rest of the target attributes in this
9818 function. */
9819 if (*str_to_check == '+')
9820 return aarch64_handle_attr_isa_flags (str_to_check);
9822 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9824 invert = true;
9825 str_to_check += 3;
9827 char *arg = strchr (str_to_check, '=');
9829 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9830 and point ARG to "foo". */
9831 if (arg)
9833 *arg = '\0';
9834 arg++;
9836 const struct aarch64_attribute_info *p_attr;
9837 bool found = false;
9838 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9840 /* If the names don't match up, or the user has given an argument
9841 to an attribute that doesn't accept one, or didn't give an argument
9842 to an attribute that expects one, fail to match. */
9843 if (strcmp (str_to_check, p_attr->name) != 0)
9844 continue;
9846 found = true;
9847 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9848 || p_attr->attr_type == aarch64_attr_enum;
9850 if (attr_need_arg_p ^ (arg != NULL))
9852 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
9853 return false;
9856 /* If the name matches but the attribute does not allow "no-" versions
9857 then we can't match. */
9858 if (invert && !p_attr->allow_neg)
9860 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
9861 return false;
9864 switch (p_attr->attr_type)
9866 /* Has a custom handler registered.
9867 For example, cpu=, arch=, tune=. */
9868 case aarch64_attr_custom:
9869 gcc_assert (p_attr->handler);
9870 if (!p_attr->handler (arg))
9871 return false;
9872 break;
9874 /* Either set or unset a boolean option. */
9875 case aarch64_attr_bool:
9877 struct cl_decoded_option decoded;
9879 generate_option (p_attr->opt_num, NULL, !invert,
9880 CL_TARGET, &decoded);
9881 aarch64_handle_option (&global_options, &global_options_set,
9882 &decoded, input_location);
9883 break;
9885 /* Set or unset a bit in the target_flags. aarch64_handle_option
9886 should know what mask to apply given the option number. */
9887 case aarch64_attr_mask:
9889 struct cl_decoded_option decoded;
9890 /* We only need to specify the option number.
9891 aarch64_handle_option will know which mask to apply. */
9892 decoded.opt_index = p_attr->opt_num;
9893 decoded.value = !invert;
9894 aarch64_handle_option (&global_options, &global_options_set,
9895 &decoded, input_location);
9896 break;
9898 /* Use the option setting machinery to set an option to an enum. */
9899 case aarch64_attr_enum:
9901 gcc_assert (arg);
9902 bool valid;
9903 int value;
9904 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9905 &value, CL_TARGET);
9906 if (valid)
9908 set_option (&global_options, NULL, p_attr->opt_num, value,
9909 NULL, DK_UNSPECIFIED, input_location,
9910 global_dc);
9912 else
9914 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
9916 break;
9918 default:
9919 gcc_unreachable ();
9923 /* If we reached here we either have found an attribute and validated
9924 it or didn't match any. If we matched an attribute but its arguments
9925 were malformed we will have returned false already. */
9926 return found;
9929 /* Count how many times the character C appears in
9930 NULL-terminated string STR. */
9932 static unsigned int
9933 num_occurences_in_str (char c, char *str)
9935 unsigned int res = 0;
9936 while (*str != '\0')
9938 if (*str == c)
9939 res++;
9941 str++;
9944 return res;
9947 /* Parse the tree in ARGS that contains the target attribute information
9948 and update the global target options space. */
9950 bool
9951 aarch64_process_target_attr (tree args)
9953 if (TREE_CODE (args) == TREE_LIST)
9957 tree head = TREE_VALUE (args);
9958 if (head)
9960 if (!aarch64_process_target_attr (head))
9961 return false;
9963 args = TREE_CHAIN (args);
9964 } while (args);
9966 return true;
9969 if (TREE_CODE (args) != STRING_CST)
9971 error ("attribute %<target%> argument not a string");
9972 return false;
9975 size_t len = strlen (TREE_STRING_POINTER (args));
9976 char *str_to_check = (char *) alloca (len + 1);
9977 strcpy (str_to_check, TREE_STRING_POINTER (args));
9979 if (len == 0)
9981 error ("malformed %<target()%> pragma or attribute");
9982 return false;
9985 /* Used to catch empty spaces between commas i.e.
9986 attribute ((target ("attr1,,attr2"))). */
9987 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9989 /* Handle multiple target attributes separated by ','. */
9990 char *token = strtok (str_to_check, ",");
9992 unsigned int num_attrs = 0;
9993 while (token)
9995 num_attrs++;
9996 if (!aarch64_process_one_target_attr (token))
9998 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
9999 return false;
10002 token = strtok (NULL, ",");
10005 if (num_attrs != num_commas + 1)
10007 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
10008 return false;
10011 return true;
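/* For illustration (a sketch, not required by the code above): the kind of
   source-level strings this parser is handed.  Each comma-separated token
   goes through aarch64_process_one_target_attr, and the accepted names are
   those listed in the aarch64_attributes table, e.g.

     __attribute__ ((target ("arch=armv8-a+crc")))
     int with_crc (void) { return 1; }

     __attribute__ ((target ("no-strict-align,tune=cortex-a53")))
     int relaxed (void) { return 2; }

   A leading '+' (e.g. "+crc") is treated purely as ISA-flag toggles by
   aarch64_handle_attr_isa_flags.  */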
10014 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
10015 process attribute ((target ("..."))). */
10017 static bool
10018 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
10020 struct cl_target_option cur_target;
10021 bool ret;
10022 tree old_optimize;
10023 tree new_target, new_optimize;
10024 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10026 /* If what we're processing is the current pragma string then the
10027 target option node is already stored in target_option_current_node
10028 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
10029 having to re-parse the string. This is especially useful to keep
10030 arm_neon.h compile times down since that header contains a lot
10031 of intrinsics enclosed in pragmas. */
10032 if (!existing_target && args == current_target_pragma)
10034 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10035 return true;
10037 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10039 old_optimize = build_optimization_node (&global_options);
10040 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10042 /* If the function changed the optimization levels as well as setting
10043 target options, start with the optimizations specified. */
10044 if (func_optimize && func_optimize != old_optimize)
10045 cl_optimization_restore (&global_options,
10046 TREE_OPTIMIZATION (func_optimize));
10048 /* Save the current target options to restore at the end. */
10049 cl_target_option_save (&cur_target, &global_options);
10051 /* If fndecl already has some target attributes applied to it, unpack
10052 them so that we add this attribute on top of them, rather than
10053 overwriting them. */
10054 if (existing_target)
10056 struct cl_target_option *existing_options
10057 = TREE_TARGET_OPTION (existing_target);
10059 if (existing_options)
10060 cl_target_option_restore (&global_options, existing_options);
10062 else
10063 cl_target_option_restore (&global_options,
10064 TREE_TARGET_OPTION (target_option_current_node));
10066 ret = aarch64_process_target_attr (args);
10068 /* Set up any additional state. */
10069 if (ret)
10071 aarch64_override_options_internal (&global_options);
10072 /* Initialize SIMD builtins if we haven't already.
10073 Set current_target_pragma to NULL for the duration so that
10074 the builtin initialization code doesn't try to tag the functions
10075 being built with the attributes specified by any current pragma, thus
10076 going into an infinite recursion. */
10077 if (TARGET_SIMD)
10079 tree saved_current_target_pragma = current_target_pragma;
10080 current_target_pragma = NULL;
10081 aarch64_init_simd_builtins ();
10082 current_target_pragma = saved_current_target_pragma;
10084 new_target = build_target_option_node (&global_options);
10086 else
10087 new_target = NULL;
10089 new_optimize = build_optimization_node (&global_options);
10091 if (fndecl && ret)
10093 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10095 if (old_optimize != new_optimize)
10096 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10099 cl_target_option_restore (&global_options, &cur_target);
10101 if (old_optimize != new_optimize)
10102 cl_optimization_restore (&global_options,
10103 TREE_OPTIMIZATION (old_optimize));
10104 return ret;
10107 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10108 tri-bool options (yes, no, don't care) and the default value is
10109 DEF, determine whether to reject inlining. */
10111 static bool
10112 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10113 int dont_care, int def)
10115 /* If the callee doesn't care, always allow inlining. */
10116 if (callee == dont_care)
10117 return true;
10119 /* If the caller doesn't care, always allow inlining. */
10120 if (caller == dont_care)
10121 return true;
10123 /* Otherwise, allow inlining if either the callee and caller values
10124 agree, or if the callee is using the default value. */
10125 return (callee == caller || callee == def);
10128 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10129 to inline CALLEE into CALLER based on target-specific info.
10130 Make sure that the caller and callee have compatible architectural
10131 features. Then go through the other possible target attributes
10132 and see if they can block inlining. Try not to reject always_inline
10133 callees unless they are incompatible architecturally. */
10135 static bool
10136 aarch64_can_inline_p (tree caller, tree callee)
10138 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10139 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10141 /* If callee has no option attributes, then it is ok to inline. */
10142 if (!callee_tree)
10143 return true;
10145 struct cl_target_option *caller_opts
10146 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10147 : target_option_default_node);
10149 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10152 /* Callee's ISA flags should be a subset of the caller's. */
10153 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10154 != callee_opts->x_aarch64_isa_flags)
10155 return false;
10157 /* Allow a non-strict-aligned function to be inlined into a
10158 strict-aligned one. */
10159 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10160 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10161 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10162 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10163 return false;
10165 bool always_inline = lookup_attribute ("always_inline",
10166 DECL_ATTRIBUTES (callee));
10168 /* If the architectural features match up and the callee is always_inline
10169 then the other attributes don't matter. */
10170 if (always_inline)
10171 return true;
10173 if (caller_opts->x_aarch64_cmodel_var
10174 != callee_opts->x_aarch64_cmodel_var)
10175 return false;
10177 if (caller_opts->x_aarch64_tls_dialect
10178 != callee_opts->x_aarch64_tls_dialect)
10179 return false;
10181 /* Honour explicit requests to workaround errata. */
10182 if (!aarch64_tribools_ok_for_inlining_p (
10183 caller_opts->x_aarch64_fix_a53_err835769,
10184 callee_opts->x_aarch64_fix_a53_err835769,
10185 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10186 return false;
10188 if (!aarch64_tribools_ok_for_inlining_p (
10189 caller_opts->x_aarch64_fix_a53_err843419,
10190 callee_opts->x_aarch64_fix_a53_err843419,
10191 2, TARGET_FIX_ERR_A53_843419))
10192 return false;
10194 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10195 caller and callee and they don't match up, reject inlining. */
10196 if (!aarch64_tribools_ok_for_inlining_p (
10197 caller_opts->x_flag_omit_leaf_frame_pointer,
10198 callee_opts->x_flag_omit_leaf_frame_pointer,
10199 2, 1))
10200 return false;
10202 /* If the callee has specific tuning overrides, respect them. */
10203 if (callee_opts->x_aarch64_override_tune_string != NULL
10204 && caller_opts->x_aarch64_override_tune_string == NULL)
10205 return false;
10207 /* If the user specified tuning override strings for the
10208 caller and callee and they don't match up, reject inlining.
10209 We just do a string compare here, we don't analyze the meaning
10210 of the string, as it would be too costly for little gain. */
10211 if (callee_opts->x_aarch64_override_tune_string
10212 && caller_opts->x_aarch64_override_tune_string
10213 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10214 caller_opts->x_aarch64_override_tune_string) != 0))
10215 return false;
10217 return true;
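/* For illustration (a sketch, not taken from the sources): assuming the
   command-line default does not enable CRC, the hook above permits the
   first call below to be inlined but refuses the second, because the
   callee's ISA flags must be a subset of the caller's.

     __attribute__ ((target ("arch=armv8-a+crc")))
     static inline int callee (int x) { return x + 1; }

     __attribute__ ((target ("arch=armv8-a+crc")))
     int caller_crc (int x) { return callee (x); }    // inlinable

     int caller_plain (int x) { return callee (x); }  // inlining rejected  */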
10220 /* Return true if SYMBOL_REF X binds locally. */
10222 static bool
10223 aarch64_symbol_binds_local_p (const_rtx x)
10225 return (SYMBOL_REF_DECL (x)
10226 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10227 : SYMBOL_REF_LOCAL_P (x));
10230 /* Return true if SYMBOL_REF X is thread local */
10231 static bool
10232 aarch64_tls_symbol_p (rtx x)
10234 if (! TARGET_HAVE_TLS)
10235 return false;
10237 if (GET_CODE (x) != SYMBOL_REF)
10238 return false;
10240 return SYMBOL_REF_TLS_MODEL (x) != 0;
10243 /* Classify a TLS symbol into one of the TLS kinds. */
10244 enum aarch64_symbol_type
10245 aarch64_classify_tls_symbol (rtx x)
10247 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10249 switch (tls_kind)
10251 case TLS_MODEL_GLOBAL_DYNAMIC:
10252 case TLS_MODEL_LOCAL_DYNAMIC:
10253 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10255 case TLS_MODEL_INITIAL_EXEC:
10256 switch (aarch64_cmodel)
10258 case AARCH64_CMODEL_TINY:
10259 case AARCH64_CMODEL_TINY_PIC:
10260 return SYMBOL_TINY_TLSIE;
10261 default:
10262 return SYMBOL_SMALL_TLSIE;
10265 case TLS_MODEL_LOCAL_EXEC:
10266 if (aarch64_tls_size == 12)
10267 return SYMBOL_TLSLE12;
10268 else if (aarch64_tls_size == 24)
10269 return SYMBOL_TLSLE24;
10270 else if (aarch64_tls_size == 32)
10271 return SYMBOL_TLSLE32;
10272 else if (aarch64_tls_size == 48)
10273 return SYMBOL_TLSLE48;
10274 else
10275 gcc_unreachable ();
10277 case TLS_MODEL_EMULATED:
10278 case TLS_MODEL_NONE:
10279 return SYMBOL_FORCE_TO_MEM;
10281 default:
10282 gcc_unreachable ();
10286 /* Return the method that should be used to access SYMBOL_REF or
10287 LABEL_REF X. */
10289 enum aarch64_symbol_type
10290 aarch64_classify_symbol (rtx x, rtx offset)
10292 if (GET_CODE (x) == LABEL_REF)
10294 switch (aarch64_cmodel)
10296 case AARCH64_CMODEL_LARGE:
10297 return SYMBOL_FORCE_TO_MEM;
10299 case AARCH64_CMODEL_TINY_PIC:
10300 case AARCH64_CMODEL_TINY:
10301 return SYMBOL_TINY_ABSOLUTE;
10303 case AARCH64_CMODEL_SMALL_SPIC:
10304 case AARCH64_CMODEL_SMALL_PIC:
10305 case AARCH64_CMODEL_SMALL:
10306 return SYMBOL_SMALL_ABSOLUTE;
10308 default:
10309 gcc_unreachable ();
10313 if (GET_CODE (x) == SYMBOL_REF)
10315 if (aarch64_tls_symbol_p (x))
10316 return aarch64_classify_tls_symbol (x);
10318 switch (aarch64_cmodel)
10320 case AARCH64_CMODEL_TINY:
10321 /* When we retrieve symbol + offset address, we have to make sure
10322 the offset does not cause overflow of the final address. But
10323 we have no way of knowing the address of symbol at compile time
10324 so we can't accurately say if the distance between the PC and
10325 symbol + offset is outside the addressable range of +/-1M in the
10326 TINY code model. So we rely on images not being greater than
10327 1M, cap the offset at 1M, and anything beyond 1M will have to
10328 be loaded using an alternative mechanism. Furthermore if the
10329 symbol is a weak reference to something that isn't known to
10330 resolve to a symbol in this module, then force to memory. */
10331 if ((SYMBOL_REF_WEAK (x)
10332 && !aarch64_symbol_binds_local_p (x))
10333 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10334 return SYMBOL_FORCE_TO_MEM;
10335 return SYMBOL_TINY_ABSOLUTE;
10337 case AARCH64_CMODEL_SMALL:
10338 /* Same reasoning as the tiny code model, but the offset cap here is
10339 4G. */
10340 if ((SYMBOL_REF_WEAK (x)
10341 && !aarch64_symbol_binds_local_p (x))
10342 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10343 HOST_WIDE_INT_C (4294967264)))
10344 return SYMBOL_FORCE_TO_MEM;
10345 return SYMBOL_SMALL_ABSOLUTE;
10347 case AARCH64_CMODEL_TINY_PIC:
10348 if (!aarch64_symbol_binds_local_p (x))
10349 return SYMBOL_TINY_GOT;
10350 return SYMBOL_TINY_ABSOLUTE;
10352 case AARCH64_CMODEL_SMALL_SPIC:
10353 case AARCH64_CMODEL_SMALL_PIC:
10354 if (!aarch64_symbol_binds_local_p (x))
10355 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10356 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10357 return SYMBOL_SMALL_ABSOLUTE;
10359 case AARCH64_CMODEL_LARGE:
10360 /* This is alright even in PIC code as the constant
10361 pool reference is always PC relative and within
10362 the same translation unit. */
10363 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10364 return SYMBOL_SMALL_ABSOLUTE;
10365 else
10366 return SYMBOL_FORCE_TO_MEM;
10368 default:
10369 gcc_unreachable ();
10373 /* By default push everything into the constant pool. */
10374 return SYMBOL_FORCE_TO_MEM;
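/* For illustration (a sketch of typical output, not taken from the
   sources): a global "int x" classified as SYMBOL_SMALL_ABSOLUTE is
   addressed with an ADRP/ADD pair, while SYMBOL_SMALL_GOT_4G adds a GOT
   load; SYMBOL_FORCE_TO_MEM instead places the address in the literal
   pool and loads it from there.

     adrp    x0, x                  // SYMBOL_SMALL_ABSOLUTE
     add     x0, x0, :lo12:x

     adrp    x0, :got:x             // SYMBOL_SMALL_GOT_4G
     ldr     x0, [x0, :got_lo12:x]  */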
10377 bool
10378 aarch64_constant_address_p (rtx x)
10380 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10383 bool
10384 aarch64_legitimate_pic_operand_p (rtx x)
10386 if (GET_CODE (x) == SYMBOL_REF
10387 || (GET_CODE (x) == CONST
10388 && GET_CODE (XEXP (x, 0)) == PLUS
10389 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10390 return false;
10392 return true;
10395 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
10396 that should be rematerialized rather than spilled. */
10398 static bool
10399 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10401 /* Support CSE and rematerialization of common constants. */
10402 if (CONST_INT_P (x) || CONST_DOUBLE_P (x) || GET_CODE (x) == CONST_VECTOR)
10403 return true;
10405 /* Do not allow vector struct mode constants. We could support
10406 0 and -1 easily, but they need support in aarch64-simd.md. */
10407 if (aarch64_vect_struct_mode_p (mode))
10408 return false;
10410 /* Do not allow wide int constants - this requires support in movti. */
10411 if (CONST_WIDE_INT_P (x))
10412 return false;
10414 /* Do not allow const (plus (anchor_symbol, const_int)). */
10415 if (GET_CODE (x) == CONST)
10417 rtx offset;
10419 split_const (x, &x, &offset);
10421 if (SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
10422 return false;
10425 if (GET_CODE (x) == HIGH)
10426 x = XEXP (x, 0);
10428 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10429 so spilling them is better than rematerialization. */
10430 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10431 return true;
10433 /* Label references are always constant. */
10434 if (GET_CODE (x) == LABEL_REF)
10435 return true;
10437 return false;
10441 aarch64_load_tp (rtx target)
10443 if (!target
10444 || GET_MODE (target) != Pmode
10445 || !register_operand (target, Pmode))
10446 target = gen_reg_rtx (Pmode);
10448 /* Can return in any reg. */
10449 emit_insn (gen_aarch64_load_tp_hard (target));
10450 return target;
10453 /* On AAPCS systems, this is the "struct __va_list". */
10454 static GTY(()) tree va_list_type;
10456 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10457 Return the type to use as __builtin_va_list.
10459 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10461 struct __va_list
10463 void *__stack;
10464 void *__gr_top;
10465 void *__vr_top;
10466 int __gr_offs;
10467 int __vr_offs;
10468 }; */
10470 static tree
10471 aarch64_build_builtin_va_list (void)
10473 tree va_list_name;
10474 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10476 /* Create the type. */
10477 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10478 /* Give it the required name. */
10479 va_list_name = build_decl (BUILTINS_LOCATION,
10480 TYPE_DECL,
10481 get_identifier ("__va_list"),
10482 va_list_type);
10483 DECL_ARTIFICIAL (va_list_name) = 1;
10484 TYPE_NAME (va_list_type) = va_list_name;
10485 TYPE_STUB_DECL (va_list_type) = va_list_name;
10487 /* Create the fields. */
10488 f_stack = build_decl (BUILTINS_LOCATION,
10489 FIELD_DECL, get_identifier ("__stack"),
10490 ptr_type_node);
10491 f_grtop = build_decl (BUILTINS_LOCATION,
10492 FIELD_DECL, get_identifier ("__gr_top"),
10493 ptr_type_node);
10494 f_vrtop = build_decl (BUILTINS_LOCATION,
10495 FIELD_DECL, get_identifier ("__vr_top"),
10496 ptr_type_node);
10497 f_groff = build_decl (BUILTINS_LOCATION,
10498 FIELD_DECL, get_identifier ("__gr_offs"),
10499 integer_type_node);
10500 f_vroff = build_decl (BUILTINS_LOCATION,
10501 FIELD_DECL, get_identifier ("__vr_offs"),
10502 integer_type_node);
10504 /* Tell tree-stdarg pass about our internal offset fields.
10505 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10506 purposes, to identify whether the code updates the va_list internal
10507 offset fields in an irregular way. */
10508 va_list_gpr_counter_field = f_groff;
10509 va_list_fpr_counter_field = f_vroff;
10511 DECL_ARTIFICIAL (f_stack) = 1;
10512 DECL_ARTIFICIAL (f_grtop) = 1;
10513 DECL_ARTIFICIAL (f_vrtop) = 1;
10514 DECL_ARTIFICIAL (f_groff) = 1;
10515 DECL_ARTIFICIAL (f_vroff) = 1;
10517 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10518 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10519 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10520 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10521 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10523 TYPE_FIELDS (va_list_type) = f_stack;
10524 DECL_CHAIN (f_stack) = f_grtop;
10525 DECL_CHAIN (f_grtop) = f_vrtop;
10526 DECL_CHAIN (f_vrtop) = f_groff;
10527 DECL_CHAIN (f_groff) = f_vroff;
10529 /* Compute its layout. */
10530 layout_type (va_list_type);
10532 return va_list_type;
10535 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10536 static void
10537 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10539 const CUMULATIVE_ARGS *cum;
10540 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10541 tree stack, grtop, vrtop, groff, vroff;
10542 tree t;
10543 int gr_save_area_size = cfun->va_list_gpr_size;
10544 int vr_save_area_size = cfun->va_list_fpr_size;
10545 int vr_offset;
10547 cum = &crtl->args.info;
10548 if (cfun->va_list_gpr_size)
10549 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10550 cfun->va_list_gpr_size);
10551 if (cfun->va_list_fpr_size)
10552 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10553 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10555 if (!TARGET_FLOAT)
10557 gcc_assert (cum->aapcs_nvrn == 0);
10558 vr_save_area_size = 0;
10561 f_stack = TYPE_FIELDS (va_list_type_node);
10562 f_grtop = DECL_CHAIN (f_stack);
10563 f_vrtop = DECL_CHAIN (f_grtop);
10564 f_groff = DECL_CHAIN (f_vrtop);
10565 f_vroff = DECL_CHAIN (f_groff);
10567 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10568 NULL_TREE);
10569 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10570 NULL_TREE);
10571 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10572 NULL_TREE);
10573 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10574 NULL_TREE);
10575 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10576 NULL_TREE);
10578 /* Emit code to initialize STACK, which points to the next varargs stack
10579 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10580 by named arguments. STACK is 8-byte aligned. */
10581 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10582 if (cum->aapcs_stack_size > 0)
10583 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10584 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10585 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10587 /* Emit code to initialize GRTOP, the top of the GR save area.
10588 virtual_incoming_args_rtx should have been 16 byte aligned. */
10589 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10590 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10591 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10593 /* Emit code to initialize VRTOP, the top of the VR save area.
10594 This address is gr_save_area_bytes below GRTOP, rounded
10595 down to the next 16-byte boundary. */
10596 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10597 vr_offset = ROUND_UP (gr_save_area_size,
10598 STACK_BOUNDARY / BITS_PER_UNIT);
10600 if (vr_offset)
10601 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10602 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10603 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10605 /* Emit code to initialize GROFF, the offset from GRTOP of the
10606 next GPR argument. */
10607 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10608 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10609 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10611 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10612 of the next VR argument. */
10613 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10614 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10615 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
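/* Worked example (an illustration, not taken from the sources): for
   "void f (int fmt, ...)" with floating point enabled and the full save
   areas (i.e. tree-stdarg has not shrunk them), one named GP argument
   leaves x1-x7 and v0-v7 to be saved, so va_start roughly initializes

     __stack   = incoming-args pointer      (no named stack arguments)
     __gr_top  = incoming-args pointer
     __vr_top  = __gr_top - 64              (56 bytes rounded up to 16)
     __gr_offs = -56                        (7 * 8 bytes of x1-x7)
     __vr_offs = -128                       (8 * 16 bytes of v0-v7).  */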
10618 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10620 static tree
10621 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10622 gimple_seq *post_p ATTRIBUTE_UNUSED)
10624 tree addr;
10625 bool indirect_p;
10626 bool is_ha; /* is HFA or HVA. */
10627 bool dw_align; /* double-word align. */
10628 machine_mode ag_mode = VOIDmode;
10629 int nregs;
10630 machine_mode mode;
10632 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10633 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10634 HOST_WIDE_INT size, rsize, adjust, align;
10635 tree t, u, cond1, cond2;
10637 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10638 if (indirect_p)
10639 type = build_pointer_type (type);
10641 mode = TYPE_MODE (type);
10643 f_stack = TYPE_FIELDS (va_list_type_node);
10644 f_grtop = DECL_CHAIN (f_stack);
10645 f_vrtop = DECL_CHAIN (f_grtop);
10646 f_groff = DECL_CHAIN (f_vrtop);
10647 f_vroff = DECL_CHAIN (f_groff);
10649 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10650 f_stack, NULL_TREE);
10651 size = int_size_in_bytes (type);
10652 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10654 dw_align = false;
10655 adjust = 0;
10656 if (aarch64_vfp_is_call_or_return_candidate (mode,
10657 type,
10658 &ag_mode,
10659 &nregs,
10660 &is_ha))
10662 /* TYPE passed in fp/simd registers. */
10663 if (!TARGET_FLOAT)
10664 aarch64_err_no_fpadvsimd (mode, "varargs");
10666 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10667 unshare_expr (valist), f_vrtop, NULL_TREE);
10668 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10669 unshare_expr (valist), f_vroff, NULL_TREE);
10671 rsize = nregs * UNITS_PER_VREG;
10673 if (is_ha)
10675 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10676 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10678 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10679 && size < UNITS_PER_VREG)
10681 adjust = UNITS_PER_VREG - size;
10684 else
10686 /* TYPE passed in general registers. */
10687 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10688 unshare_expr (valist), f_grtop, NULL_TREE);
10689 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10690 unshare_expr (valist), f_groff, NULL_TREE);
10691 rsize = ROUND_UP (size, UNITS_PER_WORD);
10692 nregs = rsize / UNITS_PER_WORD;
10694 if (align > 8)
10695 dw_align = true;
10697 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10698 && size < UNITS_PER_WORD)
10700 adjust = UNITS_PER_WORD - size;
10704 /* Get a local temporary for the field value. */
10705 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10707 /* Emit code to branch if off >= 0. */
10708 t = build2 (GE_EXPR, boolean_type_node, off,
10709 build_int_cst (TREE_TYPE (off), 0));
10710 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10712 if (dw_align)
10714 /* Emit: offs = (offs + 15) & -16. */
10715 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10716 build_int_cst (TREE_TYPE (off), 15));
10717 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10718 build_int_cst (TREE_TYPE (off), -16));
10719 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10721 else
10722 roundup = NULL;
10724 /* Update ap.__[g|v]r_offs */
10725 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10726 build_int_cst (TREE_TYPE (off), rsize));
10727 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10729 /* String up. */
10730 if (roundup)
10731 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10733 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10734 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10735 build_int_cst (TREE_TYPE (f_off), 0));
10736 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10738 /* String up: make sure the assignment happens before the use. */
10739 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10740 COND_EXPR_ELSE (cond1) = t;
10742 /* Prepare the trees handling the argument that is passed on the stack;
10743 the top-level node is stored in ON_STACK. */
10744 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10745 if (align > 8)
10747 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10748 t = fold_convert (intDI_type_node, arg);
10749 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10750 build_int_cst (TREE_TYPE (t), 15));
10751 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10752 build_int_cst (TREE_TYPE (t), -16));
10753 t = fold_convert (TREE_TYPE (arg), t);
10754 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10756 else
10757 roundup = NULL;
10758 /* Advance ap.__stack */
10759 t = fold_convert (intDI_type_node, arg);
10760 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10761 build_int_cst (TREE_TYPE (t), size + 7));
10762 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10763 build_int_cst (TREE_TYPE (t), -8));
10764 t = fold_convert (TREE_TYPE (arg), t);
10765 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10766 /* String up roundup and advance. */
10767 if (roundup)
10768 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10769 /* String up with arg */
10770 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10771 /* Big-endianness related address adjustment. */
10772 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10773 && size < UNITS_PER_WORD)
10775 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10776 size_int (UNITS_PER_WORD - size));
10777 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10780 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10781 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10783 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10784 t = off;
10785 if (adjust)
10786 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10787 build_int_cst (TREE_TYPE (off), adjust));
10789 t = fold_convert (sizetype, t);
10790 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10792 if (is_ha)
10794 /* type ha; // treat as "struct {ftype field[n];}"
10795 ... [computing offs]
10796 for (i = 0; i <nregs; ++i, offs += 16)
10797 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10798 return ha; */
10799 int i;
10800 tree tmp_ha, field_t, field_ptr_t;
10802 /* Declare a local variable. */
10803 tmp_ha = create_tmp_var_raw (type, "ha");
10804 gimple_add_tmp_var (tmp_ha);
10806 /* Establish the base type. */
10807 switch (ag_mode)
10809 case E_SFmode:
10810 field_t = float_type_node;
10811 field_ptr_t = float_ptr_type_node;
10812 break;
10813 case E_DFmode:
10814 field_t = double_type_node;
10815 field_ptr_t = double_ptr_type_node;
10816 break;
10817 case E_TFmode:
10818 field_t = long_double_type_node;
10819 field_ptr_t = long_double_ptr_type_node;
10820 break;
10821 case E_HFmode:
10822 field_t = aarch64_fp16_type_node;
10823 field_ptr_t = aarch64_fp16_ptr_type_node;
10824 break;
10825 case E_V2SImode:
10826 case E_V4SImode:
10828 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10829 field_t = build_vector_type_for_mode (innertype, ag_mode);
10830 field_ptr_t = build_pointer_type (field_t);
10832 break;
10833 default:
10834 gcc_assert (0);
10837 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10838 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10839 addr = t;
10840 t = fold_convert (field_ptr_t, addr);
10841 t = build2 (MODIFY_EXPR, field_t,
10842 build1 (INDIRECT_REF, field_t, tmp_ha),
10843 build1 (INDIRECT_REF, field_t, t));
10845 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10846 for (i = 1; i < nregs; ++i)
10848 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10849 u = fold_convert (field_ptr_t, addr);
10850 u = build2 (MODIFY_EXPR, field_t,
10851 build2 (MEM_REF, field_t, tmp_ha,
10852 build_int_cst (field_ptr_t,
10853 (i *
10854 int_size_in_bytes (field_t)))),
10855 build1 (INDIRECT_REF, field_t, u));
10856 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10859 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10860 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10863 COND_EXPR_ELSE (cond2) = t;
10864 addr = fold_convert (build_pointer_type (type), cond1);
10865 addr = build_va_arg_indirect_ref (addr);
10867 if (indirect_p)
10868 addr = build_va_arg_indirect_ref (addr);
10870 return addr;
10873 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10875 static void
10876 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10877 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10878 int no_rtl)
10880 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10881 CUMULATIVE_ARGS local_cum;
10882 int gr_saved = cfun->va_list_gpr_size;
10883 int vr_saved = cfun->va_list_fpr_size;
10885 /* The caller has advanced CUM up to, but not beyond, the last named
10886 argument. Advance a local copy of CUM past the last "real" named
10887 argument, to find out how many registers are left over. */
10888 local_cum = *cum;
10889 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10891 /* Find out how many registers we need to save.
10892 Honor tree-stdarg analysis results. */
10893 if (cfun->va_list_gpr_size)
10894 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10895 cfun->va_list_gpr_size / UNITS_PER_WORD);
10896 if (cfun->va_list_fpr_size)
10897 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10898 cfun->va_list_fpr_size / UNITS_PER_VREG);
10900 if (!TARGET_FLOAT)
10902 gcc_assert (local_cum.aapcs_nvrn == 0);
10903 vr_saved = 0;
10906 if (!no_rtl)
10908 if (gr_saved > 0)
10910 rtx ptr, mem;
10912 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10913 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10914 - gr_saved * UNITS_PER_WORD);
10915 mem = gen_frame_mem (BLKmode, ptr);
10916 set_mem_alias_set (mem, get_varargs_alias_set ());
10918 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10919 mem, gr_saved);
10921 if (vr_saved > 0)
10923 /* We can't use move_block_from_reg, because it will use
10924 the wrong mode, storing D regs only. */
10925 machine_mode mode = TImode;
10926 int off, i, vr_start;
10928 /* Set OFF to the offset from virtual_incoming_args_rtx of
10929 the first vector register. The VR save area lies below
10930 the GR one, and is aligned to 16 bytes. */
10931 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10932 STACK_BOUNDARY / BITS_PER_UNIT);
10933 off -= vr_saved * UNITS_PER_VREG;
10935 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10936 for (i = 0; i < vr_saved; ++i)
10938 rtx ptr, mem;
10940 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10941 mem = gen_frame_mem (mode, ptr);
10942 set_mem_alias_set (mem, get_varargs_alias_set ());
10943 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10944 off += UNITS_PER_VREG;
10949 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10950 any complication of having crtl->args.pretend_args_size changed. */
10951 cfun->machine->frame.saved_varargs_size
10952 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10953 STACK_BOUNDARY / BITS_PER_UNIT)
10954 + vr_saved * UNITS_PER_VREG);
10957 static void
10958 aarch64_conditional_register_usage (void)
10960 int i;
10961 if (!TARGET_FLOAT)
10963 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10965 fixed_regs[i] = 1;
10966 call_used_regs[i] = 1;
10971 /* Walk down the type tree of TYPE counting consecutive base elements.
10972 If *MODEP is VOIDmode, then set it to the first valid floating point
10973 type. If a non-floating point type is found, or if a floating point
10974 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10975 otherwise return the count in the sub-tree. */
10976 static int
10977 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10979 machine_mode mode;
10980 HOST_WIDE_INT size;
10982 switch (TREE_CODE (type))
10984 case REAL_TYPE:
10985 mode = TYPE_MODE (type);
10986 if (mode != DFmode && mode != SFmode
10987 && mode != TFmode && mode != HFmode)
10988 return -1;
10990 if (*modep == VOIDmode)
10991 *modep = mode;
10993 if (*modep == mode)
10994 return 1;
10996 break;
10998 case COMPLEX_TYPE:
10999 mode = TYPE_MODE (TREE_TYPE (type));
11000 if (mode != DFmode && mode != SFmode
11001 && mode != TFmode && mode != HFmode)
11002 return -1;
11004 if (*modep == VOIDmode)
11005 *modep = mode;
11007 if (*modep == mode)
11008 return 2;
11010 break;
11012 case VECTOR_TYPE:
11013 /* Use V2SImode and V4SImode as representatives of all 64-bit
11014 and 128-bit vector types. */
11015 size = int_size_in_bytes (type);
11016 switch (size)
11018 case 8:
11019 mode = V2SImode;
11020 break;
11021 case 16:
11022 mode = V4SImode;
11023 break;
11024 default:
11025 return -1;
11028 if (*modep == VOIDmode)
11029 *modep = mode;
11031 /* Vector modes are considered to be opaque: two vectors are
11032 equivalent for the purposes of being homogeneous aggregates
11033 if they are the same size. */
11034 if (*modep == mode)
11035 return 1;
11037 break;
11039 case ARRAY_TYPE:
11041 int count;
11042 tree index = TYPE_DOMAIN (type);
11044 /* Can't handle incomplete types nor sizes that are not
11045 fixed. */
11046 if (!COMPLETE_TYPE_P (type)
11047 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11048 return -1;
11050 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11051 if (count == -1
11052 || !index
11053 || !TYPE_MAX_VALUE (index)
11054 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11055 || !TYPE_MIN_VALUE (index)
11056 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11057 || count < 0)
11058 return -1;
11060 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11061 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11063 /* There must be no padding. */
11064 if (wi::to_wide (TYPE_SIZE (type))
11065 != count * GET_MODE_BITSIZE (*modep))
11066 return -1;
11068 return count;
11071 case RECORD_TYPE:
11073 int count = 0;
11074 int sub_count;
11075 tree field;
11077 /* Can't handle incomplete types nor sizes that are not
11078 fixed. */
11079 if (!COMPLETE_TYPE_P (type)
11080 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11081 return -1;
11083 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11085 if (TREE_CODE (field) != FIELD_DECL)
11086 continue;
11088 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11089 if (sub_count < 0)
11090 return -1;
11091 count += sub_count;
11094 /* There must be no padding. */
11095 if (wi::to_wide (TYPE_SIZE (type))
11096 != count * GET_MODE_BITSIZE (*modep))
11097 return -1;
11099 return count;
11102 case UNION_TYPE:
11103 case QUAL_UNION_TYPE:
11105 /* These aren't very interesting except in a degenerate case. */
11106 int count = 0;
11107 int sub_count;
11108 tree field;
11110 /* Can't handle incomplete types nor sizes that are not
11111 fixed. */
11112 if (!COMPLETE_TYPE_P (type)
11113 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11114 return -1;
11116 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11118 if (TREE_CODE (field) != FIELD_DECL)
11119 continue;
11121 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11122 if (sub_count < 0)
11123 return -1;
11124 count = count > sub_count ? count : sub_count;
11127 /* There must be no padding. */
11128 if (wi::to_wide (TYPE_SIZE (type))
11129 != count * GET_MODE_BITSIZE (*modep))
11130 return -1;
11132 return count;
11135 default:
11136 break;
11139 return -1;
11142 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11143 type as described in AAPCS64 \S 4.1.2.
11145 See the comment above aarch64_composite_type_p for the notes on MODE. */
11147 static bool
11148 aarch64_short_vector_p (const_tree type,
11149 machine_mode mode)
11151 HOST_WIDE_INT size = -1;
11153 if (type && TREE_CODE (type) == VECTOR_TYPE)
11154 size = int_size_in_bytes (type);
11155 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11156 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11157 size = GET_MODE_SIZE (mode);
11159 return (size == 8 || size == 16);
11162 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11163 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11164 array types. The C99 floating-point complex types are also considered
11165 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11166 types, which are GCC extensions and out of the scope of AAPCS64, are
11167 treated as composite types here as well.
11169 Note that MODE itself is not sufficient in determining whether a type
11170 is such a composite type or not. This is because
11171 stor-layout.c:compute_record_mode may have already changed the MODE
11172 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11173 structure with only one field may have its MODE set to the mode of the
11174 field. Also an integer mode whose size matches the size of the
11175 RECORD_TYPE type may be used to substitute the original mode
11176 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11177 solely relied on. */
11179 static bool
11180 aarch64_composite_type_p (const_tree type,
11181 machine_mode mode)
11183 if (aarch64_short_vector_p (type, mode))
11184 return false;
11186 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11187 return true;
11189 if (mode == BLKmode
11190 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11191 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11192 return true;
11194 return false;
11197 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11198 shall be passed or returned in simd/fp register(s) (providing these
11199 parameter passing registers are available).
11201 Upon successful return, *COUNT returns the number of needed registers,
11202 *BASE_MODE returns the mode of the individual register and when IS_HA
11203 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11204 floating-point aggregate or a homogeneous short-vector aggregate. */
11206 static bool
11207 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11208 const_tree type,
11209 machine_mode *base_mode,
11210 int *count,
11211 bool *is_ha)
11213 machine_mode new_mode = VOIDmode;
11214 bool composite_p = aarch64_composite_type_p (type, mode);
11216 if (is_ha != NULL) *is_ha = false;
11218 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11219 || aarch64_short_vector_p (type, mode))
11221 *count = 1;
11222 new_mode = mode;
11224 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11226 if (is_ha != NULL) *is_ha = true;
11227 *count = 2;
11228 new_mode = GET_MODE_INNER (mode);
11230 else if (type && composite_p)
11232 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11234 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11236 if (is_ha != NULL) *is_ha = true;
11237 *count = ag_count;
11239 else
11240 return false;
11242 else
11243 return false;
11245 *base_mode = new_mode;
11246 return true;
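/* For illustration (a sketch, not taken from the sources; int32x4_t is
   the arm_neon.h vector type): aggregates the predicate above does and
   does not treat as HFA/HVA candidates.

     struct hfa { double a, b, c; };    // 3 x DFmode, *is_ha = true
     struct hva { int32x4_t lo, hi; };  // 2 x V4SImode, *is_ha = true
     struct m   { double a; float b; }; // mixed base modes: rejected
     struct big { double a[5]; };       // 5 > HA_MAX_NUM_FLDS: rejected  */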
11249 /* Implement TARGET_STRUCT_VALUE_RTX. */
11251 static rtx
11252 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11253 int incoming ATTRIBUTE_UNUSED)
11255 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11258 /* Implements target hook vector_mode_supported_p. */
11259 static bool
11260 aarch64_vector_mode_supported_p (machine_mode mode)
11262 if (TARGET_SIMD
11263 && (mode == V4SImode || mode == V8HImode
11264 || mode == V16QImode || mode == V2DImode
11265 || mode == V2SImode || mode == V4HImode
11266 || mode == V8QImode || mode == V2SFmode
11267 || mode == V4SFmode || mode == V2DFmode
11268 || mode == V4HFmode || mode == V8HFmode
11269 || mode == V1DFmode))
11270 return true;
11272 return false;
11275 /* Return appropriate SIMD container
11276 for MODE within a vector of WIDTH bits. */
11277 static machine_mode
11278 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11280 gcc_assert (width == 64 || width == 128);
11281 if (TARGET_SIMD)
11283 if (width == 128)
11284 switch (mode)
11286 case E_DFmode:
11287 return V2DFmode;
11288 case E_SFmode:
11289 return V4SFmode;
11290 case E_HFmode:
11291 return V8HFmode;
11292 case E_SImode:
11293 return V4SImode;
11294 case E_HImode:
11295 return V8HImode;
11296 case E_QImode:
11297 return V16QImode;
11298 case E_DImode:
11299 return V2DImode;
11300 default:
11301 break;
11303 else
11304 switch (mode)
11306 case E_SFmode:
11307 return V2SFmode;
11308 case E_HFmode:
11309 return V4HFmode;
11310 case E_SImode:
11311 return V2SImode;
11312 case E_HImode:
11313 return V4HImode;
11314 case E_QImode:
11315 return V8QImode;
11316 default:
11317 break;
11320 return word_mode;
11323 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11324 static machine_mode
11325 aarch64_preferred_simd_mode (scalar_mode mode)
11327 return aarch64_simd_container_mode (mode, 128);
11330 /* Return the bitmask of possible vector sizes for the vectorizer
11331 to iterate over. */
11332 static unsigned int
11333 aarch64_autovectorize_vector_sizes (void)
11335 return (16 | 8);
11338 /* Implement TARGET_MANGLE_TYPE. */
11340 static const char *
11341 aarch64_mangle_type (const_tree type)
11343 /* The AArch64 ABI documents say that "__va_list" has to be
11344 mangled as if it is in the "std" namespace. */
11345 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11346 return "St9__va_list";
11348 /* Half-precision float. */
11349 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11350 return "Dh";
11352 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11353 builtin types. */
11354 if (TYPE_NAME (type) != NULL)
11355 return aarch64_mangle_builtin_type (type);
11357 /* Use the default mangling. */
11358 return NULL;
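/* For illustration (assuming the usual Itanium C++ mangling prefix, not
   taken from the sources): the rules above give

     void f (__builtin_va_list);   // mangled as _Z1fSt9__va_list
     void g (__fp16);              // mangled as _Z1gDh  */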
11361 /* Find the first rtx_insn before insn that will generate an assembly
11362 instruction. */
11364 static rtx_insn *
11365 aarch64_prev_real_insn (rtx_insn *insn)
11367 if (!insn)
11368 return NULL;
11372 insn = prev_real_insn (insn);
11374 while (insn && recog_memoized (insn) < 0);
11376 return insn;
11379 static bool
11380 is_madd_op (enum attr_type t1)
11382 unsigned int i;
11383 /* A number of these may be AArch32 only. */
11384 enum attr_type mlatypes[] = {
11385 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11386 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11387 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11390 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11392 if (t1 == mlatypes[i])
11393 return true;
11396 return false;
11399 /* Check if there is a register dependency between a load and the insn
11400 for which we hold recog_data. */
11402 static bool
11403 dep_between_memop_and_curr (rtx memop)
11405 rtx load_reg;
11406 int opno;
11408 gcc_assert (GET_CODE (memop) == SET);
11410 if (!REG_P (SET_DEST (memop)))
11411 return false;
11413 load_reg = SET_DEST (memop);
11414 for (opno = 1; opno < recog_data.n_operands; opno++)
11416 rtx operand = recog_data.operand[opno];
11417 if (REG_P (operand)
11418 && reg_overlap_mentioned_p (load_reg, operand))
11419 return true;
11422 return false;
11426 /* When working around the Cortex-A53 erratum 835769,
11427 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11428 instruction and has a preceding memory instruction such that a NOP
11429 should be inserted between them. */
11431 bool
11432 aarch64_madd_needs_nop (rtx_insn* insn)
11434 enum attr_type attr_type;
11435 rtx_insn *prev;
11436 rtx body;
11438 if (!TARGET_FIX_ERR_A53_835769)
11439 return false;
11441 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11442 return false;
11444 attr_type = get_attr_type (insn);
11445 if (!is_madd_op (attr_type))
11446 return false;
11448 prev = aarch64_prev_real_insn (insn);
11449 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11450 Restore recog state to INSN to avoid state corruption. */
11451 extract_constrain_insn_cached (insn);
11453 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11454 return false;
11456 body = single_set (prev);
11458 /* If the previous insn is a memory op and there is no dependency between
11459 it and the DImode madd, emit a NOP between them. If body is NULL then we
11460 have a complex memory operation, probably a load/store pair.
11461 Be conservative for now and emit a NOP. */
11462 if (GET_MODE (recog_data.operand[0]) == DImode
11463 && (!body || !dep_between_memop_and_curr (body)))
11464 return true;
11466 return false;
11471 /* Implement FINAL_PRESCAN_INSN. */
11473 void
11474 aarch64_final_prescan_insn (rtx_insn *insn)
11476 if (aarch64_madd_needs_nop (insn))
11477 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
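/* For illustration (a sketch of typical output, not taken from the
   sources): with -mfix-cortex-a53-835769 a 64-bit multiply-accumulate
   that directly follows a memory operation is separated from it:

     ldr     x1, [x2]
     nop     // between mem op and mult-accumulate
     madd    x0, x3, x4, x5  */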
11481 /* Return the equivalent letter for size. */
11482 static char
11483 sizetochar (int size)
11485 switch (size)
11487 case 64: return 'd';
11488 case 32: return 's';
11489 case 16: return 'h';
11490 case 8 : return 'b';
11491 default: gcc_unreachable ();
11495 /* Return true iff x is a uniform vector of floating-point
11496 constants, and the constant can be represented in
11497 quarter-precision form. Note, as aarch64_float_const_representable_p
11498 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11499 static bool
11500 aarch64_vect_float_const_representable_p (rtx x)
11502 rtx elt;
11503 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11504 && const_vec_duplicate_p (x, &elt)
11505 && aarch64_float_const_representable_p (elt));
11508 /* Return true for valid and false for invalid. */
11509 bool
11510 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11511 struct simd_immediate_info *info,
11512 enum simd_immediate_check which)
11514 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11515 matches = 1; \
11516 for (i = 0; i < idx; i += (STRIDE)) \
11517 if (!(TEST)) \
11518 matches = 0; \
11519 if (matches) \
11521 immtype = (CLASS); \
11522 elsize = (ELSIZE); \
11523 eshift = (SHIFT); \
11524 emvn = (NEG); \
11525 break; \
11528 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11529 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11530 unsigned char bytes[16];
11531 int immtype = -1, matches;
11532 unsigned int invmask = inverse ? 0xff : 0;
11533 int eshift, emvn;
11535 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11537 if (! (aarch64_simd_imm_zero_p (op, mode)
11538 || aarch64_vect_float_const_representable_p (op)))
11539 return false;
11541 if (info)
11543 rtx elt = CONST_VECTOR_ELT (op, 0);
11544 scalar_float_mode elt_mode
11545 = as_a <scalar_float_mode> (GET_MODE (elt));
11547 info->value = elt;
11548 info->element_width = GET_MODE_BITSIZE (elt_mode);
11549 info->mvn = false;
11550 info->shift = 0;
11553 return true;
11556 /* Splat vector constant out into a byte vector. */
11557 for (i = 0; i < n_elts; i++)
11559 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11560 it must be laid out in the vector register in reverse order. */
11561 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11562 unsigned HOST_WIDE_INT elpart;
11564 gcc_assert (CONST_INT_P (el));
11565 elpart = INTVAL (el);
11567 for (unsigned int byte = 0; byte < innersize; byte++)
11569 bytes[idx++] = (elpart & 0xff) ^ invmask;
11570 elpart >>= BITS_PER_UNIT;
11575 /* Sanity check. */
11576 gcc_assert (idx == GET_MODE_SIZE (mode));
11580 if (which & AARCH64_CHECK_ORR)
11582 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11583 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11585 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11586 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11588 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11589 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11591 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11592 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11594 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11596 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11599 if (which & AARCH64_CHECK_BIC)
11601 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11602 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11604 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11605 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11607 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11608 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11610 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11611 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11613 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11615 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11618 /* Shifting ones / 8-bit / 64-bit variants only checked
11619 for 'ALL' (MOVI/MVNI). */
11620 if (which == AARCH64_CHECK_MOV)
11622 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11623 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11625 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11626 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11628 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11629 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11631 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11632 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11634 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11636 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11637 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11640 while (0);
11642 if (immtype == -1)
11643 return false;
11645 if (info)
11647 info->element_width = elsize;
11648 info->mvn = emvn != 0;
11649 info->shift = eshift;
11651 unsigned HOST_WIDE_INT imm = 0;
11653 if (immtype >= 12 && immtype <= 15)
11654 info->msl = true;
11656 /* Un-invert bytes of recognized vector, if necessary. */
11657 if (invmask != 0)
11658 for (i = 0; i < idx; i++)
11659 bytes[i] ^= invmask;
11661 if (immtype == 17)
11663 /* FIXME: Broken on 32-bit H_W_I hosts. */
11664 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11666 for (i = 0; i < 8; i++)
11667 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11668 << (i * BITS_PER_UNIT);
11671 info->value = GEN_INT (imm);
11673 else
11675 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11676 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11678 /* Construct 'abcdefgh' because the assembler cannot handle
11679 generic constants. */
11680 if (info->mvn)
11681 imm = ~imm;
11682 imm = (imm >> info->shift) & 0xff;
11683 info->value = GEN_INT (imm);
11687 return true;
11688 #undef CHECK
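/* Worked example (an illustration, not taken from the sources): a
   V4SImode constant whose elements are all 0x0000ff00 splats to the
   byte pattern { 00 ff 00 00 } and matches the second AARCH64_CHECK_ORR
   test above, giving immtype 1, element width 32, shift 8, mvn false,
   and a returned value of 0xff -- i.e. the instruction

     movi    v0.4s, 0xff, lsl 8

   The complemented pattern would instead be caught by one of the
   AARCH64_CHECK_BIC tests and emitted as an MVNI.  */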
11691 /* Check if immediate shift constants are within range. */
11692 bool
11693 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11695 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11696 if (left)
11697 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11698 else
11699 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11702 /* Return true if X is a uniform vector where all elements
11703 are either the floating-point constant 0.0 or the
11704 integer constant 0. */
11705 bool
11706 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11708 return x == CONST0_RTX (mode);
11712 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11713 operation of width WIDTH at bit position POS. */
11716 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11718 gcc_assert (CONST_INT_P (width));
11719 gcc_assert (CONST_INT_P (pos));
11721 unsigned HOST_WIDE_INT mask
11722 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11723 return GEN_INT (mask << UINTVAL (pos));
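/* Worked example (an illustration, not taken from the sources): for
   WIDTH = 8 and POS = 16 this returns ((1 << 8) - 1) << 16 = 0xff0000,
   the mask selecting bits 16..23.  */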
11726 bool
11727 aarch64_mov_operand_p (rtx x, machine_mode mode)
11729 if (GET_CODE (x) == HIGH
11730 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11731 return true;
11733 if (CONST_INT_P (x))
11734 return true;
11736 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11737 return true;
11739 return aarch64_classify_symbolic_expression (x)
11740 == SYMBOL_TINY_ABSOLUTE;
11743 /* Return a const_int vector of VAL. */
11745 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11747 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
11748 return gen_const_vec_duplicate (mode, c);
11751 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11753 bool
11754 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11756 machine_mode vmode;
11758 vmode = aarch64_preferred_simd_mode (mode);
11759 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11760 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11763 /* Construct and return a PARALLEL RTX vector with elements numbering the
11764 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11765 the vector - from the perspective of the architecture. This does not
11766 line up with GCC's perspective on lane numbers, so we end up with
11767 different masks depending on our target endian-ness. The diagram
11768 below may help. We must draw the distinction when building masks
11769 which select one half of the vector. An instruction selecting
11770 architectural low-lanes for a big-endian target, must be described using
11771 a mask selecting GCC high-lanes.
11773 Big-Endian Little-Endian
11775 GCC 0 1 2 3 3 2 1 0
11776 | x | x | x | x | | x | x | x | x |
11777 Architecture 3 2 1 0 3 2 1 0
11779 Low Mask: { 2, 3 } { 0, 1 }
11780 High Mask: { 0, 1 } { 2, 3 }
11782 MODE Is the mode of the vector and NUNITS is the number of units in it. */
11785 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
11787 rtvec v = rtvec_alloc (nunits / 2);
11788 int high_base = nunits / 2;
11789 int low_base = 0;
11790 int base;
11791 rtx t1;
11792 int i;
11794 if (BYTES_BIG_ENDIAN)
11795 base = high ? low_base : high_base;
11796 else
11797 base = high ? high_base : low_base;
11799 for (i = 0; i < nunits / 2; i++)
11800 RTVEC_ELT (v, i) = GEN_INT (base + i);
11802 t1 = gen_rtx_PARALLEL (mode, v);
11803 return t1;
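/* Worked example (an illustration, not taken from the sources): for
   V4SImode (NUNITS = 4) the returned PARALLEL is

     little-endian:  low = (parallel [0 1]),  high = (parallel [2 3])
     big-endian:     low = (parallel [2 3]),  high = (parallel [0 1])

   matching the Low/High mask table in the comment above.  */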
11806 /* Check OP for validity as a PARALLEL RTX vector with elements
11807 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11808 from the perspective of the architecture. See the diagram above
11809 aarch64_simd_vect_par_cnst_half for more details. */
11811 bool
11812 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11813 bool high)
11815 if (!VECTOR_MODE_P (mode))
11816 return false;
11818 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, GET_MODE_NUNITS (mode),
11819 high);
11820 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11821 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11822 int i = 0;
11824 if (count_op != count_ideal)
11825 return false;
11827 for (i = 0; i < count_ideal; i++)
11829 rtx elt_op = XVECEXP (op, 0, i);
11830 rtx elt_ideal = XVECEXP (ideal, 0, i);
11832 if (!CONST_INT_P (elt_op)
11833 || INTVAL (elt_ideal) != INTVAL (elt_op))
11834 return false;
11836 return true;
11839 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11840 HIGH (exclusive). */
11841 void
11842 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11843 const_tree exp)
11845 HOST_WIDE_INT lane;
11846 gcc_assert (CONST_INT_P (operand));
11847 lane = INTVAL (operand);
11849 if (lane < low || lane >= high)
11851 if (exp)
11852 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11853 else
11854 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11858 /* Perform endian correction on lane number N, which indexes a vector
11859 of mode MODE, and return the result as an SImode rtx. */
11862 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
11864 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
11867 /* Return TRUE if OP is a MEM with a valid vector addressing mode (a base register or post-increment address). */
11868 bool
11869 aarch64_simd_mem_operand_p (rtx op)
11871 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11872 || REG_P (XEXP (op, 0)));
11875 /* Emit a register copy from operand to operand, taking care not to
11876 early-clobber source registers in the process.
11878 COUNT is the number of components into which the copy needs to be
11879 decomposed. */
11880 void
11881 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11882 unsigned int count)
11884 unsigned int i;
11885 int rdest = REGNO (operands[0]);
11886 int rsrc = REGNO (operands[1]);
11888 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11889 || rdest < rsrc)
11890 for (i = 0; i < count; i++)
11891 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11892 gen_rtx_REG (mode, rsrc + i));
11893 else
11894 for (i = 0; i < count; i++)
11895 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11896 gen_rtx_REG (mode, rsrc + count - i - 1));
11899 /* Compute and return the length (in bytes) of aarch64_simd_reglist<mode>, where
11900 <mode> is one of the VSTRUCT modes: OI, CI, or XI. */
11902 aarch64_simd_attr_length_rglist (machine_mode mode)
11904 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11907 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11908 alignment of a vector to 128 bits. */
11909 static HOST_WIDE_INT
11910 aarch64_simd_vector_alignment (const_tree type)
11912 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11913 return MIN (align, 128);
11916 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11917 static bool
11918 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11920 if (is_packed)
11921 return false;
11923 /* We guarantee alignment for vectors up to 128 bits. */
11924 if (tree_int_cst_compare (TYPE_SIZE (type),
11925 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11926 return false;
11928 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11929 return true;
11932 /* Return true if the vector misalignment factor is supported by the
11933 target. */
11934 static bool
11935 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11936 const_tree type, int misalignment,
11937 bool is_packed)
11939 if (TARGET_SIMD && STRICT_ALIGNMENT)
11942 /* Return false if the movmisalign pattern is not supported for this mode. */
11942 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11943 return false;
11945 /* Misalignment factor is unknown at compile time. */
11946 if (misalignment == -1)
11947 return false;
11949 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11950 is_packed);
11953 /* If VALS is a vector constant that can be loaded into a register
11954 using DUP, generate instructions to do so and return an RTX to
11955 assign to the register. Otherwise return NULL_RTX. */
11956 static rtx
11957 aarch64_simd_dup_constant (rtx vals)
11959 machine_mode mode = GET_MODE (vals);
11960 machine_mode inner_mode = GET_MODE_INNER (mode);
11961 rtx x;
11963 if (!const_vec_duplicate_p (vals, &x))
11964 return NULL_RTX;
11966 /* We can load this constant by using DUP and a constant in a
11967 single general-purpose register. This will be cheaper than a vector
11968 load. */
11969 x = copy_to_mode_reg (inner_mode, x);
11970 return gen_vec_duplicate (mode, x);
11974 /* Generate code to load VALS, which is a PARALLEL containing only
11975 constants (for vec_init) or CONST_VECTOR, efficiently into a
11976 register. Returns an RTX to copy into the register, or NULL_RTX
11977 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11978 static rtx
11979 aarch64_simd_make_constant (rtx vals)
11981 machine_mode mode = GET_MODE (vals);
11982 rtx const_dup;
11983 rtx const_vec = NULL_RTX;
11984 int n_elts = GET_MODE_NUNITS (mode);
11985 int n_const = 0;
11986 int i;
11988 if (GET_CODE (vals) == CONST_VECTOR)
11989 const_vec = vals;
11990 else if (GET_CODE (vals) == PARALLEL)
11992 /* A CONST_VECTOR must contain only CONST_INTs and
11993 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11994 Only store valid constants in a CONST_VECTOR. */
11995 for (i = 0; i < n_elts; ++i)
11997 rtx x = XVECEXP (vals, 0, i);
11998 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11999 n_const++;
12001 if (n_const == n_elts)
12002 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
12004 else
12005 gcc_unreachable ();
12007 if (const_vec != NULL_RTX
12008 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
12009 /* Load using MOVI/MVNI. */
12010 return const_vec;
12011 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
12012 /* Loaded using DUP. */
12013 return const_dup;
12014 else if (const_vec != NULL_RTX)
12015 /* Load from constant pool. We cannot take advantage of single-cycle
12016 LD1 because we need a PC-relative addressing mode. */
12017 return const_vec;
12018 else
12019 /* A PARALLEL containing something not valid inside CONST_VECTOR.
12020 We cannot construct an initializer. */
12021 return NULL_RTX;
12024 /* Expand a vector initialisation sequence, such that TARGET is
12025 initialised to contain VALS. */
12027 void
12028 aarch64_expand_vector_init (rtx target, rtx vals)
12030 machine_mode mode = GET_MODE (target);
12031 scalar_mode inner_mode = GET_MODE_INNER (mode);
12032 /* The number of vector elements. */
12033 int n_elts = GET_MODE_NUNITS (mode);
12034 /* The number of vector elements which are not constant. */
12035 int n_var = 0;
12036 rtx any_const = NULL_RTX;
12037 /* The first element of vals. */
12038 rtx v0 = XVECEXP (vals, 0, 0);
12039 bool all_same = true;
12041 /* Count the number of variable elements to initialise. */
12042 for (int i = 0; i < n_elts; ++i)
12044 rtx x = XVECEXP (vals, 0, i);
12045 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12046 ++n_var;
12047 else
12048 any_const = x;
12050 all_same &= rtx_equal_p (x, v0);
12053 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
12054 how best to handle this. */
12055 if (n_var == 0)
12057 rtx constant = aarch64_simd_make_constant (vals);
12058 if (constant != NULL_RTX)
12060 emit_move_insn (target, constant);
12061 return;
12065 /* Splat a single non-constant element if we can. */
12066 if (all_same)
12068 rtx x = copy_to_mode_reg (inner_mode, v0);
12069 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
12070 return;
12073 enum insn_code icode = optab_handler (vec_set_optab, mode);
12074 gcc_assert (icode != CODE_FOR_nothing);
12076 /* If there are only variable elements, try to optimize
12077 the insertion using dup for the most common element
12078 followed by insertions. */
12080 /* The algorithm will fill matches[*][0] with the earliest matching element,
12081 and matches[X][1] with the count of duplicate elements (if X is the
12082 earliest element which has duplicates). */
12084 if (n_var == n_elts && n_elts <= 16)
12086 int matches[16][2] = {0};
12087 for (int i = 0; i < n_elts; i++)
12089 for (int j = 0; j <= i; j++)
12091 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12093 matches[i][0] = j;
12094 matches[j][1]++;
12095 break;
12099 int maxelement = 0;
12100 int maxv = 0;
12101 for (int i = 0; i < n_elts; i++)
12102 if (matches[i][1] > maxv)
12104 maxelement = i;
12105 maxv = matches[i][1];
12108 /* Create a duplicate of the most common element. */
12109 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12110 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
12112 /* Insert the rest. */
12113 for (int i = 0; i < n_elts; i++)
12115 rtx x = XVECEXP (vals, 0, i);
12116 if (matches[i][0] == maxelement)
12117 continue;
12118 x = copy_to_mode_reg (inner_mode, x);
12119 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12121 return;
12124 /* Initialise a vector which is part-variable. We want to first try
12125 to build those lanes which are constant in the most efficient way we
12126 can. */
12127 if (n_var != n_elts)
12129 rtx copy = copy_rtx (vals);
12131 /* Load constant part of vector. We really don't care what goes into the
12132 parts we will overwrite, but we're more likely to be able to load the
12133 constant efficiently if it has fewer, larger, repeating parts
12134 (see aarch64_simd_valid_immediate). */
12135 for (int i = 0; i < n_elts; i++)
12137 rtx x = XVECEXP (vals, 0, i);
12138 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12139 continue;
12140 rtx subst = any_const;
12141 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12143 /* Look in the copied vector, as more elements are const. */
12144 rtx test = XVECEXP (copy, 0, i ^ bit);
12145 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12147 subst = test;
12148 break;
12151 XVECEXP (copy, 0, i) = subst;
12153 aarch64_expand_vector_init (target, copy);
12156 /* Insert the variable lanes directly. */
12157 for (int i = 0; i < n_elts; i++)
12159 rtx x = XVECEXP (vals, 0, i);
12160 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12161 continue;
12162 x = copy_to_mode_reg (inner_mode, x);
12163 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
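/* Return the mask to apply to shift amounts in MODE: zero if shift counts are
   not truncated (SHIFT_COUNT_TRUNCATED is false, or MODE is a vector or
   vector-struct mode), otherwise GET_MODE_BITSIZE (MODE) - 1.  */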
12167 static unsigned HOST_WIDE_INT
12168 aarch64_shift_truncation_mask (machine_mode mode)
12170 return
12171 (!SHIFT_COUNT_TRUNCATED
12172 || aarch64_vector_mode_supported_p (mode)
12173 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12176 /* Select a format to encode pointers in exception handling data. */
12178 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12180 int type;
12181 switch (aarch64_cmodel)
12183 case AARCH64_CMODEL_TINY:
12184 case AARCH64_CMODEL_TINY_PIC:
12185 case AARCH64_CMODEL_SMALL:
12186 case AARCH64_CMODEL_SMALL_PIC:
12187 case AARCH64_CMODEL_SMALL_SPIC:
12188 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
12189 for everything. */
12190 type = DW_EH_PE_sdata4;
12191 break;
12192 default:
12193 /* No assumptions here. 8-byte relocs required. */
12194 type = DW_EH_PE_sdata8;
12195 break;
12197 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12200 /* The last .arch and .tune assembly strings that we printed. */
12201 static std::string aarch64_last_printed_arch_string;
12202 static std::string aarch64_last_printed_tune_string;
12204 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12205 by the function fndecl. */
12207 void
12208 aarch64_declare_function_name (FILE *stream, const char* name,
12209 tree fndecl)
12211 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12213 struct cl_target_option *targ_options;
12214 if (target_parts)
12215 targ_options = TREE_TARGET_OPTION (target_parts);
12216 else
12217 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12218 gcc_assert (targ_options);
12220 const struct processor *this_arch
12221 = aarch64_get_arch (targ_options->x_explicit_arch);
12223 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12224 std::string extension
12225 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12226 this_arch->flags);
12227 /* Only update the assembler .arch string if it is distinct from the last
12228 such string we printed. */
12229 std::string to_print = this_arch->name + extension;
12230 if (to_print != aarch64_last_printed_arch_string)
12232 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12233 aarch64_last_printed_arch_string = to_print;
12236 /* Print the CPU name we're tuning for in the comments; it might be
12237 useful to readers of the generated asm. Do it only when it changes
12238 from function to function and verbose assembly is requested. */
12239 const struct processor *this_tune
12240 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12242 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12244 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12245 this_tune->name);
12246 aarch64_last_printed_tune_string = this_tune->name;
12249 /* Don't forget the type directive for ELF. */
12250 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12251 ASM_OUTPUT_LABEL (stream, name);
12254 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12256 static void
12257 aarch64_start_file (void)
12259 struct cl_target_option *default_options
12260 = TREE_TARGET_OPTION (target_option_default_node);
12262 const struct processor *default_arch
12263 = aarch64_get_arch (default_options->x_explicit_arch);
12264 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12265 std::string extension
12266 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12267 default_arch->flags);
12269 aarch64_last_printed_arch_string = default_arch->name + extension;
12270 aarch64_last_printed_tune_string = "";
12271 asm_fprintf (asm_out_file, "\t.arch %s\n",
12272 aarch64_last_printed_arch_string.c_str ());
12274 default_file_start ();
12277 /* Emit load exclusive. */
12279 static void
12280 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12281 rtx mem, rtx model_rtx)
12283 rtx (*gen) (rtx, rtx, rtx);
12285 switch (mode)
12287 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12288 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12289 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12290 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12291 default:
12292 gcc_unreachable ();
12295 emit_insn (gen (rval, mem, model_rtx));
12298 /* Emit store exclusive. */
12300 static void
12301 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12302 rtx rval, rtx mem, rtx model_rtx)
12304 rtx (*gen) (rtx, rtx, rtx, rtx);
12306 switch (mode)
12308 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12309 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12310 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12311 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12312 default:
12313 gcc_unreachable ();
12316 emit_insn (gen (bval, rval, mem, model_rtx));
12319 /* Emit the jump pattern INSN and mark the jump as unlikely to be taken. */
12321 static void
12322 aarch64_emit_unlikely_jump (rtx insn)
12324 rtx_insn *jump = emit_jump_insn (insn);
12325 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12328 /* Expand a compare and swap pattern. */
12330 void
12331 aarch64_expand_compare_and_swap (rtx operands[])
12333 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12334 machine_mode mode, cmp_mode;
12335 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12336 int idx;
12337 gen_cas_fn gen;
12338 const gen_cas_fn split_cas[] =
12340 gen_aarch64_compare_and_swapqi,
12341 gen_aarch64_compare_and_swaphi,
12342 gen_aarch64_compare_and_swapsi,
12343 gen_aarch64_compare_and_swapdi
12345 const gen_cas_fn atomic_cas[] =
12347 gen_aarch64_compare_and_swapqi_lse,
12348 gen_aarch64_compare_and_swaphi_lse,
12349 gen_aarch64_compare_and_swapsi_lse,
12350 gen_aarch64_compare_and_swapdi_lse
12353 bval = operands[0];
12354 rval = operands[1];
12355 mem = operands[2];
12356 oldval = operands[3];
12357 newval = operands[4];
12358 is_weak = operands[5];
12359 mod_s = operands[6];
12360 mod_f = operands[7];
12361 mode = GET_MODE (mem);
12362 cmp_mode = mode;
12364 /* Normally the succ memory model must be stronger than fail, but in the
12365 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12366 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12368 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12369 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12370 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12372 switch (mode)
12374 case E_QImode:
12375 case E_HImode:
12376 /* For short modes, we're going to perform the comparison in SImode,
12377 so do the zero-extension now. */
12378 cmp_mode = SImode;
12379 rval = gen_reg_rtx (SImode);
12380 oldval = convert_modes (SImode, mode, oldval, true);
12381 /* Fall through. */
12383 case E_SImode:
12384 case E_DImode:
12385 /* Force the value into a register if needed. */
12386 if (!aarch64_plus_operand (oldval, mode))
12387 oldval = force_reg (cmp_mode, oldval);
12388 break;
12390 default:
12391 gcc_unreachable ();
12394 switch (mode)
12396 case E_QImode: idx = 0; break;
12397 case E_HImode: idx = 1; break;
12398 case E_SImode: idx = 2; break;
12399 case E_DImode: idx = 3; break;
12400 default:
12401 gcc_unreachable ();
12403 if (TARGET_LSE)
12404 gen = atomic_cas[idx];
12405 else
12406 gen = split_cas[idx];
12408 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12410 if (mode == QImode || mode == HImode)
12411 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12413 x = gen_rtx_REG (CCmode, CC_REGNUM);
12414 x = gen_rtx_EQ (SImode, x, const0_rtx);
12415 emit_insn (gen_rtx_SET (bval, x));
12418 /* Test whether the target supports using an atomic load-operate instruction
12419 for operation CODE. Returns FALSE if the operation isn't supported by the
12420 architecture. */
12424 bool
12425 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12427 if (!TARGET_LSE)
12428 return false;
12430 switch (code)
12432 case SET:
12433 case AND:
12434 case IOR:
12435 case XOR:
12436 case MINUS:
12437 case PLUS:
12438 return true;
12439 default:
12440 return false;
12444 /* Emit a barrier appropriate for memory model MODEL at the end of a
12445 sequence implementing an atomic operation. */
12447 static void
12448 aarch64_emit_post_barrier (enum memmodel model)
12450 const enum memmodel base_model = memmodel_base (model);
12452 if (is_mm_sync (model)
12453 && (base_model == MEMMODEL_ACQUIRE
12454 || base_model == MEMMODEL_ACQ_REL
12455 || base_model == MEMMODEL_SEQ_CST))
12457 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12461 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12462 for the data in memory. EXPECTED is the value expected to be in memory.
12463 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12464 is the memory ordering to use. */
12466 void
12467 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12468 rtx expected, rtx desired,
12469 rtx model)
12471 rtx (*gen) (rtx, rtx, rtx, rtx);
12472 machine_mode mode;
12474 mode = GET_MODE (mem);
12476 switch (mode)
12478 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12479 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12480 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12481 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12482 default:
12483 gcc_unreachable ();
12486 /* Move the expected value into the CAS destination register. */
12487 emit_insn (gen_rtx_SET (rval, expected));
12489 /* Emit the CAS. */
12490 emit_insn (gen (rval, mem, desired, model));
12492 /* Compare the expected value with the value loaded by the CAS, to establish
12493 whether the swap was made. */
12494 aarch64_gen_compare_reg (EQ, rval, expected);
12497 /* Split a compare and swap pattern. */
12499 void
12500 aarch64_split_compare_and_swap (rtx operands[])
12502 rtx rval, mem, oldval, newval, scratch;
12503 machine_mode mode;
12504 bool is_weak;
12505 rtx_code_label *label1, *label2;
12506 rtx x, cond;
12507 enum memmodel model;
12508 rtx model_rtx;
12510 rval = operands[0];
12511 mem = operands[1];
12512 oldval = operands[2];
12513 newval = operands[3];
12514 is_weak = (operands[4] != const0_rtx);
12515 model_rtx = operands[5];
12516 scratch = operands[7];
12517 mode = GET_MODE (mem);
12518 model = memmodel_from_int (INTVAL (model_rtx));
12520 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12521 loop:
12522 .label1:
12523 LD[A]XR rval, [mem]
12524 CBNZ rval, .label2
12525 ST[L]XR scratch, newval, [mem]
12526 CBNZ scratch, .label1
12527 .label2:
12528 CMP rval, 0. */
12529 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12531 label1 = NULL;
12532 if (!is_weak)
12534 label1 = gen_label_rtx ();
12535 emit_label (label1);
12537 label2 = gen_label_rtx ();
12539 /* The initial load can be relaxed for a __sync operation since a final
12540 barrier will be emitted to stop code hoisting. */
12541 if (is_mm_sync (model))
12542 aarch64_emit_load_exclusive (mode, rval, mem,
12543 GEN_INT (MEMMODEL_RELAXED));
12544 else
12545 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12547 if (strong_zero_p)
12549 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12550 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12551 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12552 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12554 else
12556 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12557 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12558 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12559 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12560 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12563 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12565 if (!is_weak)
12567 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12568 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12569 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12570 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12572 else
12574 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12575 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12576 emit_insn (gen_rtx_SET (cond, x));
12579 emit_label (label2);
12580 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
12581 to set the condition flags. If this is not used it will be removed by
12582 later passes. */
12583 if (strong_zero_p)
12585 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12586 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12587 emit_insn (gen_rtx_SET (cond, x));
12589 /* Emit any final barrier needed for a __sync operation. */
12590 if (is_mm_sync (model))
12591 aarch64_emit_post_barrier (model);
12594 /* Emit a BIC instruction. */
12596 static void
12597 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12599 rtx shift_rtx = GEN_INT (shift);
12600 rtx (*gen) (rtx, rtx, rtx, rtx);
12602 switch (mode)
12604 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12605 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12606 default:
12607 gcc_unreachable ();
12610 emit_insn (gen (dst, s2, shift_rtx, s1));
12613 /* Emit an atomic swap. */
12615 static void
12616 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12617 rtx mem, rtx model)
12619 rtx (*gen) (rtx, rtx, rtx, rtx);
12621 switch (mode)
12623 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12624 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12625 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12626 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12627 default:
12628 gcc_unreachable ();
12631 emit_insn (gen (dst, mem, value, model));
12634 /* Operations supported by aarch64_emit_atomic_load_op. */
12636 enum aarch64_atomic_load_op_code
12638 AARCH64_LDOP_PLUS, /* A + B */
12639 AARCH64_LDOP_XOR, /* A ^ B */
12640 AARCH64_LDOP_OR, /* A | B */
12641 AARCH64_LDOP_BIC /* A & ~B */
12644 /* Emit an atomic load-operate. */
12646 static void
12647 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12648 machine_mode mode, rtx dst, rtx src,
12649 rtx mem, rtx model)
12651 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12652 const aarch64_atomic_load_op_fn plus[] =
12654 gen_aarch64_atomic_loadaddqi,
12655 gen_aarch64_atomic_loadaddhi,
12656 gen_aarch64_atomic_loadaddsi,
12657 gen_aarch64_atomic_loadadddi
12659 const aarch64_atomic_load_op_fn eor[] =
12661 gen_aarch64_atomic_loadeorqi,
12662 gen_aarch64_atomic_loadeorhi,
12663 gen_aarch64_atomic_loadeorsi,
12664 gen_aarch64_atomic_loadeordi
12666 const aarch64_atomic_load_op_fn ior[] =
12668 gen_aarch64_atomic_loadsetqi,
12669 gen_aarch64_atomic_loadsethi,
12670 gen_aarch64_atomic_loadsetsi,
12671 gen_aarch64_atomic_loadsetdi
12673 const aarch64_atomic_load_op_fn bic[] =
12675 gen_aarch64_atomic_loadclrqi,
12676 gen_aarch64_atomic_loadclrhi,
12677 gen_aarch64_atomic_loadclrsi,
12678 gen_aarch64_atomic_loadclrdi
12680 aarch64_atomic_load_op_fn gen;
12681 int idx = 0;
12683 switch (mode)
12685 case E_QImode: idx = 0; break;
12686 case E_HImode: idx = 1; break;
12687 case E_SImode: idx = 2; break;
12688 case E_DImode: idx = 3; break;
12689 default:
12690 gcc_unreachable ();
12693 switch (code)
12695 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12696 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12697 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12698 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12699 default:
12700 gcc_unreachable ();
12703 emit_insn (gen (dst, mem, src, model));
12706 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12707 location to store the data read from memory. OUT_RESULT is the location to
12708 store the result of the operation. MEM is the memory location to read and
12709 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12710 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12711 be NULL. */
12713 void
12714 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12715 rtx mem, rtx value, rtx model_rtx)
12717 machine_mode mode = GET_MODE (mem);
12718 machine_mode wmode = (mode == DImode ? DImode : SImode);
12719 const bool short_mode = (mode < SImode);
12720 aarch64_atomic_load_op_code ldop_code;
12721 rtx src;
12722 rtx x;
12724 if (out_data)
12725 out_data = gen_lowpart (mode, out_data);
12727 if (out_result)
12728 out_result = gen_lowpart (mode, out_result);
12730 /* Make sure the value is in a register, putting it into a destination
12731 register if it needs to be manipulated. */
12732 if (!register_operand (value, mode)
12733 || code == AND || code == MINUS)
12735 src = out_result ? out_result : out_data;
12736 emit_move_insn (src, gen_lowpart (mode, value));
12738 else
12739 src = value;
12740 gcc_assert (register_operand (src, mode));
12742 /* Preprocess the data for the operation as necessary. If the operation is
12743 a SET then emit a swap instruction and finish. */
12744 switch (code)
12746 case SET:
12747 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12748 return;
12750 case MINUS:
12751 /* Negate the value and treat it as a PLUS. */
12753 rtx neg_src;
12755 /* Resize the value if necessary. */
12756 if (short_mode)
12757 src = gen_lowpart (wmode, src);
12759 neg_src = gen_rtx_NEG (wmode, src);
12760 emit_insn (gen_rtx_SET (src, neg_src));
12762 if (short_mode)
12763 src = gen_lowpart (mode, src);
12765 /* Fall-through. */
12766 case PLUS:
12767 ldop_code = AARCH64_LDOP_PLUS;
12768 break;
12770 case IOR:
12771 ldop_code = AARCH64_LDOP_OR;
12772 break;
12774 case XOR:
12775 ldop_code = AARCH64_LDOP_XOR;
12776 break;
12778 case AND:
12780 rtx not_src;
12782 /* Resize the value if necessary. */
12783 if (short_mode)
12784 src = gen_lowpart (wmode, src);
12786 not_src = gen_rtx_NOT (wmode, src);
12787 emit_insn (gen_rtx_SET (src, not_src));
12789 if (short_mode)
12790 src = gen_lowpart (mode, src);
12792 ldop_code = AARCH64_LDOP_BIC;
12793 break;
12795 default:
12796 /* The operation can't be done with atomic instructions. */
12797 gcc_unreachable ();
12800 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12802 /* If necessary, calculate the data in memory after the update by redoing the
12803 operation from values in registers. */
12804 if (!out_result)
12805 return;
12807 if (short_mode)
12809 src = gen_lowpart (wmode, src);
12810 out_data = gen_lowpart (wmode, out_data);
12811 out_result = gen_lowpart (wmode, out_result);
12814 x = NULL_RTX;
12816 switch (code)
12818 case MINUS:
12819 case PLUS:
12820 x = gen_rtx_PLUS (wmode, out_data, src);
12821 break;
12822 case IOR:
12823 x = gen_rtx_IOR (wmode, out_data, src);
12824 break;
12825 case XOR:
12826 x = gen_rtx_XOR (wmode, out_data, src);
12827 break;
12828 case AND:
12829 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12830 return;
12831 default:
12832 gcc_unreachable ();
12835 emit_set_insn (out_result, x);
12837 return;
12840 /* Split an atomic operation. */
12842 void
12843 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12844 rtx value, rtx model_rtx, rtx cond)
12846 machine_mode mode = GET_MODE (mem);
12847 machine_mode wmode = (mode == DImode ? DImode : SImode);
12848 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12849 const bool is_sync = is_mm_sync (model);
12850 rtx_code_label *label;
12851 rtx x;
12853 /* Split the atomic operation into a sequence. */
12854 label = gen_label_rtx ();
12855 emit_label (label);
12857 if (new_out)
12858 new_out = gen_lowpart (wmode, new_out);
12859 if (old_out)
12860 old_out = gen_lowpart (wmode, old_out);
12861 else
12862 old_out = new_out;
12863 value = simplify_gen_subreg (wmode, value, mode, 0);
12865 /* The initial load can be relaxed for a __sync operation since a final
12866 barrier will be emitted to stop code hoisting. */
12867 if (is_sync)
12868 aarch64_emit_load_exclusive (mode, old_out, mem,
12869 GEN_INT (MEMMODEL_RELAXED));
12870 else
12871 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12873 switch (code)
12875 case SET:
12876 new_out = value;
12877 break;
12879 case NOT:
12880 x = gen_rtx_AND (wmode, old_out, value);
12881 emit_insn (gen_rtx_SET (new_out, x));
12882 x = gen_rtx_NOT (wmode, new_out);
12883 emit_insn (gen_rtx_SET (new_out, x));
12884 break;
12886 case MINUS:
12887 if (CONST_INT_P (value))
12889 value = GEN_INT (-INTVAL (value));
12890 code = PLUS;
12892 /* Fall through. */
12894 default:
12895 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12896 emit_insn (gen_rtx_SET (new_out, x));
12897 break;
12900 aarch64_emit_store_exclusive (mode, cond, mem,
12901 gen_lowpart (mode, new_out), model_rtx);
12903 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12904 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12905 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12906 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12908 /* Emit any final barrier needed for a __sync operation. */
12909 if (is_sync)
12910 aarch64_emit_post_barrier (model);
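/* Implement TARGET_INIT_LIBFUNCS.  */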
12913 static void
12914 aarch64_init_libfuncs (void)
12916 /* Half-precision float operations. The compiler handles all operations
12917 with NULL libfuncs by converting to SFmode. */
12919 /* Conversions. */
12920 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12921 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12923 /* Arithmetic. */
12924 set_optab_libfunc (add_optab, HFmode, NULL);
12925 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12926 set_optab_libfunc (smul_optab, HFmode, NULL);
12927 set_optab_libfunc (neg_optab, HFmode, NULL);
12928 set_optab_libfunc (sub_optab, HFmode, NULL);
12930 /* Comparisons. */
12931 set_optab_libfunc (eq_optab, HFmode, NULL);
12932 set_optab_libfunc (ne_optab, HFmode, NULL);
12933 set_optab_libfunc (lt_optab, HFmode, NULL);
12934 set_optab_libfunc (le_optab, HFmode, NULL);
12935 set_optab_libfunc (ge_optab, HFmode, NULL);
12936 set_optab_libfunc (gt_optab, HFmode, NULL);
12937 set_optab_libfunc (unord_optab, HFmode, NULL);
12940 /* Target hook for c_mode_for_suffix. */
12941 static machine_mode
12942 aarch64_c_mode_for_suffix (char suffix)
12944 if (suffix == 'q')
12945 return TFmode;
12947 return VOIDmode;
12950 /* We can only represent floating point constants which will fit in
12951 "quarter-precision" values. These values are characterised by
12952 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12955 (-1)^s * (n/16) * 2^r
12957 Where:
12958 's' is the sign bit.
12959 'n' is an integer in the range 16 <= n <= 31.
12960 'r' is an integer in the range -3 <= r <= 4. */
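/* For example, 1.0 is (-1)^0 * (16/16) * 2^0 and 31.0 is (-1)^0 * (31/16) * 2^4,
   so both are representable, whereas 0.1 is not: it cannot be written as
   n/16 * 2^r with n and r in the ranges above.  */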
12962 /* Return true iff X can be represented by a quarter-precision
12963 floating point immediate operand. Note, we cannot represent 0.0. */
12964 bool
12965 aarch64_float_const_representable_p (rtx x)
12967 /* This represents our current view of how many bits
12968 make up the mantissa. */
12969 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12970 int exponent;
12971 unsigned HOST_WIDE_INT mantissa, mask;
12972 REAL_VALUE_TYPE r, m;
12973 bool fail;
12975 if (!CONST_DOUBLE_P (x))
12976 return false;
12978 /* We don't support HFmode constants yet. */
12979 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12980 return false;
12982 r = *CONST_DOUBLE_REAL_VALUE (x);
12984 /* We cannot represent infinities, NaNs or +/-zero. We won't
12985 know if we have +zero until we analyse the mantissa, but we
12986 can reject the other invalid values. */
12987 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12988 || REAL_VALUE_MINUS_ZERO (r))
12989 return false;
12991 /* Extract exponent. */
12992 r = real_value_abs (&r);
12993 exponent = REAL_EXP (&r);
12995 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12996 highest (sign) bit, with a fixed binary point at bit point_pos.
12997 The low HOST_WIDE_INT of W holds the low part of the mantissa, the high one the high part.
12998 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12999 bits for the mantissa, this can fail (low bits will be lost). */
13000 real_ldexp (&m, &r, point_pos - exponent);
13001 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
13003 /* If the low part of the mantissa has bits set we cannot represent
13004 the value. */
13005 if (w.ulow () != 0)
13006 return false;
13007 /* We have rejected the lower HOST_WIDE_INT, so update our
13008 understanding of how many bits lie in the mantissa and
13009 look only at the high HOST_WIDE_INT. */
13010 mantissa = w.elt (1);
13011 point_pos -= HOST_BITS_PER_WIDE_INT;
13013 /* We can only represent values with a mantissa of the form 1.xxxx. */
13014 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
13015 if ((mantissa & mask) != 0)
13016 return false;
13018 /* Having filtered unrepresentable values, we may now remove all
13019 but the highest 5 bits. */
13020 mantissa >>= point_pos - 5;
13022 /* We cannot represent the value 0.0, so reject it. This is handled
13023 elsewhere. */
13024 if (mantissa == 0)
13025 return false;
13027 /* Then, as bit 4 is always set, we can mask it off, leaving
13028 the mantissa in the range [0, 15]. */
13029 mantissa &= ~(1 << 4);
13030 gcc_assert (mantissa <= 15);
13032 /* GCC internally does not use IEEE754-like encoding (where normalized
13033 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
13034 Our mantissa values are shifted 4 places to the left relative to
13035 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13036 by 5 places to correct for GCC's representation. */
13037 exponent = 5 - exponent;
13039 return (exponent >= 0 && exponent <= 7);
13042 /* Return the output string for an AdvSIMD MOVI, MVNI, ORR or BIC immediate
13043 move of CONST_VECTOR, which has mode MODE and width WIDTH bits. WHICH
13044 selects whether to check for a MOVI/MVNI, ORR or BIC immediate. */
13045 char*
13046 aarch64_output_simd_mov_immediate (rtx const_vector,
13047 machine_mode mode,
13048 unsigned width,
13049 enum simd_immediate_check which)
13051 bool is_valid;
13052 static char templ[40];
13053 const char *mnemonic;
13054 const char *shift_op;
13055 unsigned int lane_count = 0;
13056 char element_char;
13058 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13060 /* This will return true to show const_vector is legal for use as either
13061 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
13062 It will also update INFO to show how the immediate should be generated.
13063 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
13064 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false,
13065 &info, which);
13066 gcc_assert (is_valid);
13068 element_char = sizetochar (info.element_width);
13069 lane_count = width / info.element_width;
13071 mode = GET_MODE_INNER (mode);
13072 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13074 gcc_assert (info.shift == 0 && ! info.mvn);
13075 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13076 move immediate path. */
13077 if (aarch64_float_const_zero_rtx_p (info.value))
13078 info.value = GEN_INT (0);
13079 else
13081 const unsigned int buf_size = 20;
13082 char float_buf[buf_size] = {'\0'};
13083 real_to_decimal_for_mode (float_buf,
13084 CONST_DOUBLE_REAL_VALUE (info.value),
13085 buf_size, buf_size, 1, mode);
13087 if (lane_count == 1)
13088 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13089 else
13090 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13091 lane_count, element_char, float_buf);
13092 return templ;
13096 gcc_assert (CONST_INT_P (info.value));
13098 if (which == AARCH64_CHECK_MOV)
13100 mnemonic = info.mvn ? "mvni" : "movi";
13101 shift_op = info.msl ? "msl" : "lsl";
13102 if (lane_count == 1)
13103 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13104 mnemonic, UINTVAL (info.value));
13105 else if (info.shift)
13106 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13107 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
13108 element_char, UINTVAL (info.value), shift_op, info.shift);
13109 else
13110 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13111 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
13112 element_char, UINTVAL (info.value));
13114 else
13116 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
13117 mnemonic = info.mvn ? "bic" : "orr";
13118 if (info.shift)
13119 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13120 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
13121 element_char, UINTVAL (info.value), "lsl", info.shift);
13122 else
13123 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13124 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
13125 element_char, UINTVAL (info.value));
13127 return templ;
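/* Return the output string for moving the scalar immediate IMMEDIATE of mode
   MODE into a SIMD register: duplicate it into a constant vector and reuse
   aarch64_output_simd_mov_immediate.  */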
13130 char*
13131 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13134 /* If a floating point number was passed and we desire to use it in an
13135 integer mode, do the conversion to integer. */
13136 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13138 unsigned HOST_WIDE_INT ival;
13139 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13140 gcc_unreachable ();
13141 immediate = gen_int_mode (ival, mode);
13144 machine_mode vmode;
13145 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
13146 a 128-bit vector mode. */
13147 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13149 vmode = aarch64_simd_container_mode (mode, width);
13150 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13151 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13154 /* Split operands into moves from op[1] + op[2] into op[0]. */
13156 void
13157 aarch64_split_combinev16qi (rtx operands[3])
13159 unsigned int dest = REGNO (operands[0]);
13160 unsigned int src1 = REGNO (operands[1]);
13161 unsigned int src2 = REGNO (operands[2]);
13162 machine_mode halfmode = GET_MODE (operands[1]);
13163 unsigned int halfregs = REG_NREGS (operands[1]);
13164 rtx destlo, desthi;
13166 gcc_assert (halfmode == V16QImode);
13168 if (src1 == dest && src2 == dest + halfregs)
13170 /* No-op move. Can't split to nothing; emit something. */
13171 emit_note (NOTE_INSN_DELETED);
13172 return;
13175 /* Preserve register attributes for variable tracking. */
13176 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13177 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13178 GET_MODE_SIZE (halfmode));
13180 /* Special case of reversed high/low parts. */
13181 if (reg_overlap_mentioned_p (operands[2], destlo)
13182 && reg_overlap_mentioned_p (operands[1], desthi))
13184 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13185 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13186 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13188 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13190 /* Try to avoid unnecessary moves if part of the result
13191 is in the right place already. */
13192 if (src1 != dest)
13193 emit_move_insn (destlo, operands[1]);
13194 if (src2 != dest + halfregs)
13195 emit_move_insn (desthi, operands[2]);
13197 else
13199 if (src2 != dest + halfregs)
13200 emit_move_insn (desthi, operands[2]);
13201 if (src1 != dest)
13202 emit_move_insn (destlo, operands[1]);
13206 /* vec_perm support. */
13208 #define MAX_VECT_LEN 16
13210 struct expand_vec_perm_d
13212 rtx target, op0, op1;
13213 auto_vec_perm_indices perm;
13214 machine_mode vmode;
13215 bool one_vector_p;
13216 bool testing_p;
13219 /* Generate a variable permutation. */
13221 static void
13222 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13224 machine_mode vmode = GET_MODE (target);
13225 bool one_vector_p = rtx_equal_p (op0, op1);
13227 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13228 gcc_checking_assert (GET_MODE (op0) == vmode);
13229 gcc_checking_assert (GET_MODE (op1) == vmode);
13230 gcc_checking_assert (GET_MODE (sel) == vmode);
13231 gcc_checking_assert (TARGET_SIMD);
13233 if (one_vector_p)
13235 if (vmode == V8QImode)
13237 /* Expand the argument to a V16QI mode by duplicating it. */
13238 rtx pair = gen_reg_rtx (V16QImode);
13239 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13240 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13242 else
13244 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13247 else
13249 rtx pair;
13251 if (vmode == V8QImode)
13253 pair = gen_reg_rtx (V16QImode);
13254 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13255 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13257 else
13259 pair = gen_reg_rtx (OImode);
13260 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13261 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13266 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
13267 NELT is the number of elements in the vector. */
13269 void
13270 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
13271 unsigned int nelt)
13273 machine_mode vmode = GET_MODE (target);
13274 bool one_vector_p = rtx_equal_p (op0, op1);
13275 rtx mask;
13277 /* The TBL instruction does not use a modulo index, so we must take care
13278 of that ourselves. */
13279 mask = aarch64_simd_gen_const_vector_dup (vmode,
13280 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13281 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13283 /* For big-endian, we also need to reverse the index within the vector
13284 (but not which vector). */
13285 if (BYTES_BIG_ENDIAN)
13287 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13288 if (!one_vector_p)
13289 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13290 sel = expand_simple_binop (vmode, XOR, sel, mask,
13291 NULL, 0, OPTAB_LIB_WIDEN);
13293 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13296 /* Recognize patterns suitable for the TRN instructions. */
13297 static bool
13298 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13300 unsigned int i, odd, mask, nelt = d->perm.length ();
13301 rtx out, in0, in1, x;
13302 machine_mode vmode = d->vmode;
13304 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13305 return false;
13307 /* Note that these are little-endian tests.
13308 We correct for big-endian later. */
13309 if (d->perm[0] == 0)
13310 odd = 0;
13311 else if (d->perm[0] == 1)
13312 odd = 1;
13313 else
13314 return false;
13315 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13317 for (i = 0; i < nelt; i += 2)
13319 if (d->perm[i] != i + odd)
13320 return false;
13321 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13322 return false;
13325 /* Success! */
13326 if (d->testing_p)
13327 return true;
13329 in0 = d->op0;
13330 in1 = d->op1;
13331 if (BYTES_BIG_ENDIAN)
13333 x = in0, in0 = in1, in1 = x;
13334 odd = !odd;
13336 out = d->target;
13338 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13339 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
13340 return true;
13343 /* Recognize patterns suitable for the UZP instructions. */
13344 static bool
13345 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13347 unsigned int i, odd, mask, nelt = d->perm.length ();
13348 rtx out, in0, in1, x;
13349 machine_mode vmode = d->vmode;
13351 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13352 return false;
13354 /* Note that these are little-endian tests.
13355 We correct for big-endian later. */
13356 if (d->perm[0] == 0)
13357 odd = 0;
13358 else if (d->perm[0] == 1)
13359 odd = 1;
13360 else
13361 return false;
13362 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13364 for (i = 0; i < nelt; i++)
13366 unsigned elt = (i * 2 + odd) & mask;
13367 if (d->perm[i] != elt)
13368 return false;
13371 /* Success! */
13372 if (d->testing_p)
13373 return true;
13375 in0 = d->op0;
13376 in1 = d->op1;
13377 if (BYTES_BIG_ENDIAN)
13379 x = in0, in0 = in1, in1 = x;
13380 odd = !odd;
13382 out = d->target;
13384 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13385 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
13386 return true;
13389 /* Recognize patterns suitable for the ZIP instructions. */
13390 static bool
13391 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13393 unsigned int i, high, mask, nelt = d->perm.length ();
13394 rtx out, in0, in1, x;
13395 machine_mode vmode = d->vmode;
13397 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13398 return false;
13400 /* Note that these are little-endian tests.
13401 We correct for big-endian later. */
13402 high = nelt / 2;
13403 if (d->perm[0] == high)
13404 /* Do Nothing. */
13406 else if (d->perm[0] == 0)
13407 high = 0;
13408 else
13409 return false;
13410 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13412 for (i = 0; i < nelt / 2; i++)
13414 unsigned elt = (i + high) & mask;
13415 if (d->perm[i * 2] != elt)
13416 return false;
13417 elt = (elt + nelt) & mask;
13418 if (d->perm[i * 2 + 1] != elt)
13419 return false;
13422 /* Success! */
13423 if (d->testing_p)
13424 return true;
13426 in0 = d->op0;
13427 in1 = d->op1;
13428 if (BYTES_BIG_ENDIAN)
13430 x = in0, in0 = in1, in1 = x;
13431 high = !high;
13433 out = d->target;
13435 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
13436 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
13437 return true;
13440 /* Recognize patterns for the EXT insn. */
13442 static bool
13443 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13445 unsigned int i, nelt = d->perm.length ();
13446 rtx offset;
13448 unsigned int location = d->perm[0]; /* Always < nelt. */
13450 /* Check if the extracted indices are increasing by one. */
13451 for (i = 1; i < nelt; i++)
13453 unsigned int required = location + i;
13454 if (d->one_vector_p)
13456 /* We'll pass the same vector in twice, so allow indices to wrap. */
13457 required &= (nelt - 1);
13459 if (d->perm[i] != required)
13460 return false;
13463 /* Success! */
13464 if (d->testing_p)
13465 return true;
13467 /* The case where (location == 0) is a no-op for both big- and little-endian,
13468 and is removed by the mid-end at optimization levels -O1 and higher. */
13470 if (BYTES_BIG_ENDIAN && (location != 0))
13472 /* After setup, we want the high elements of the first vector (stored
13473 at the LSB end of the register), and the low elements of the second
13474 vector (stored at the MSB end of the register). So swap. */
13475 std::swap (d->op0, d->op1);
13476 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13477 location = nelt - location;
13480 offset = GEN_INT (location);
13481 emit_set_insn (d->target,
13482 gen_rtx_UNSPEC (d->vmode,
13483 gen_rtvec (3, d->op0, d->op1, offset),
13484 UNSPEC_EXT));
13485 return true;
13488 /* Recognize patterns for the REV insns. */
13490 static bool
13491 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13493 unsigned int i, j, diff, size, unspec, nelt = d->perm.length ();
13495 if (!d->one_vector_p)
13496 return false;
13498 diff = d->perm[0];
13499 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
13500 if (size == 8)
13501 unspec = UNSPEC_REV64;
13502 else if (size == 4)
13503 unspec = UNSPEC_REV32;
13504 else if (size == 2)
13505 unspec = UNSPEC_REV16;
13506 else
13507 return false;
13509 for (i = 0; i < nelt ; i += diff + 1)
13510 for (j = 0; j <= diff; j += 1)
13512 /* This is guaranteed to be true as the value of diff
13513 is 7, 3 or 1 and we should have enough elements in the
13514 queue to generate this. Getting a vector mask with a
13515 value of diff other than these values implies that
13516 something is wrong by the time we get here. */
13517 gcc_assert (i + j < nelt);
13518 if (d->perm[i + j] != i + diff - j)
13519 return false;
13522 /* Success! */
13523 if (d->testing_p)
13524 return true;
13526 emit_set_insn (d->target, gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0),
13527 unspec));
13528 return true;
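/* Recognize permutations suitable for the DUP instruction: those which
   broadcast a single element across the whole vector.  */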
13531 static bool
13532 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13534 rtx out = d->target;
13535 rtx in0;
13536 machine_mode vmode = d->vmode;
13537 unsigned int i, elt, nelt = d->perm.length ();
13538 rtx lane;
13540 elt = d->perm[0];
13541 for (i = 1; i < nelt; i++)
13543 if (elt != d->perm[i])
13544 return false;
13547 /* The generic preparation in aarch64_expand_vec_perm_const_1
13548 swaps the operand order and the permute indices if it finds
13549 d->perm[0] to be in the second operand. Thus, we can always
13550 use d->op0 and need not do any extra arithmetic to get the
13551 correct lane number. */
13552 in0 = d->op0;
13553 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13555 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
13556 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
13557 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
13558 return true;
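/* Fall back to expanding a V8QI or V16QI constant permutation with TBL,
   using the permute indices as a constant selector vector.  */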
13561 static bool
13562 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13564 rtx rperm[MAX_VECT_LEN], sel;
13565 machine_mode vmode = d->vmode;
13566 unsigned int i, nelt = d->perm.length ();
13568 if (d->testing_p)
13569 return true;
13571 /* Generic code will try constant permutation twice. Once with the
13572 original mode and again with the elements lowered to QImode.
13573 So wait and don't do the selector expansion ourselves. */
13574 if (vmode != V8QImode && vmode != V16QImode)
13575 return false;
13577 for (i = 0; i < nelt; ++i)
13579 int nunits = GET_MODE_NUNITS (vmode);
13581 /* If big-endian and two vectors, we end up with a weird mixed-endian
13582 mode on NEON. Reverse the index within each word but not the word
13583 itself. */
13584 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13585 : d->perm[i]);
13587 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13588 sel = force_reg (vmode, sel);
13590 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13591 return true;
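/* Try to expand the constant permutation described by D, first using the
   single-instruction patterns (REV, EXT, DUP, ZIP, UZP, TRN) and then
   falling back to TBL.  Return true on success.  */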
13594 static bool
13595 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13597 /* The pattern matching functions above are written to look for a small
13598 number to begin the sequence (0, 1, N/2). If we begin with an index
13599 from the second operand, we can swap the operands. */
13600 unsigned int nelt = d->perm.length ();
13601 if (d->perm[0] >= nelt)
13603 gcc_assert (nelt == (nelt & -nelt));
13604 for (unsigned int i = 0; i < nelt; ++i)
13605 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13607 std::swap (d->op0, d->op1);
13610 if (TARGET_SIMD && nelt > 1)
13612 if (aarch64_evpc_rev (d))
13613 return true;
13614 else if (aarch64_evpc_ext (d))
13615 return true;
13616 else if (aarch64_evpc_dup (d))
13617 return true;
13618 else if (aarch64_evpc_zip (d))
13619 return true;
13620 else if (aarch64_evpc_uzp (d))
13621 return true;
13622 else if (aarch64_evpc_trn (d))
13623 return true;
13624 return aarch64_evpc_tbl (d);
13626 return false;
13629 /* Expand a vec_perm_const pattern with the operands given by TARGET,
13630 OP0, OP1 and SEL. NELT is the number of elements in the vector. */
13632 bool
13633 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel,
13634 unsigned int nelt)
13636 struct expand_vec_perm_d d;
13637 unsigned int i, which;
13639 d.target = target;
13640 d.op0 = op0;
13641 d.op1 = op1;
13643 d.vmode = GET_MODE (target);
13644 gcc_assert (VECTOR_MODE_P (d.vmode));
13645 d.testing_p = false;
13647 d.perm.reserve (nelt);
13648 for (i = which = 0; i < nelt; ++i)
13650 rtx e = XVECEXP (sel, 0, i);
13651 unsigned int ei = INTVAL (e) & (2 * nelt - 1);
13652 which |= (ei < nelt ? 1 : 2);
13653 d.perm.quick_push (ei);
13656 switch (which)
13658 default:
13659 gcc_unreachable ();
13661 case 3:
13662 d.one_vector_p = false;
13663 if (!rtx_equal_p (op0, op1))
13664 break;
13666 /* The elements of PERM do not suggest that only the first operand
13667 is used, but both operands are identical. Allow easier matching
13668 of the permutation by folding the permutation into the single
13669 input vector. */
13670 /* Fall Through. */
13671 case 2:
13672 for (i = 0; i < nelt; ++i)
13673 d.perm[i] &= nelt - 1;
13674 d.op0 = op1;
13675 d.one_vector_p = true;
13676 break;
13678 case 1:
13679 d.op1 = op0;
13680 d.one_vector_p = true;
13681 break;
13684 return aarch64_expand_vec_perm_const_1 (&d);
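/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK: return true if the constant
   permutation SEL is supported for mode VMODE, checked by expanding it
   inside a throwaway insn sequence.  */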
13687 static bool
13688 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13690 struct expand_vec_perm_d d;
13691 unsigned int i, nelt, which;
13692 bool ret;
13694 d.vmode = vmode;
13695 d.testing_p = true;
13696 d.perm.safe_splice (sel);
13698 /* Calculate whether all elements are in one vector. */
13699 nelt = sel.length ();
13700 for (i = which = 0; i < nelt; ++i)
13702 unsigned int e = d.perm[i];
13703 gcc_assert (e < 2 * nelt);
13704 which |= (e < nelt ? 1 : 2);
13707 /* If all elements are from the second vector, reindex as if from the
13708 first vector. */
13709 if (which == 2)
13710 for (i = 0; i < nelt; ++i)
13711 d.perm[i] -= nelt;
13713 /* Check whether the mask can be applied to a single vector. */
13714 d.one_vector_p = (which != 3);
13716 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13717 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13718 if (!d.one_vector_p)
13719 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13721 start_sequence ();
13722 ret = aarch64_expand_vec_perm_const_1 (&d);
13723 end_sequence ();
13725 return ret;
13728 /* Generate a byte permute mask for a register of mode MODE,
13729 which has NUNITS units. */
13732 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
13734 /* We have to reverse each vector because we don't have
13735 a permuted load that can reverse-load according to ABI rules. */
13736 rtx mask;
13737 rtvec v = rtvec_alloc (16);
13738 unsigned int i, j;
13739 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
13741 gcc_assert (BYTES_BIG_ENDIAN);
13742 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13744 for (i = 0; i < nunits; i++)
13745 for (j = 0; j < usize; j++)
13746 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13747 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13748 return force_reg (V16QImode, mask);
13751 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13752 true. However, due to issues with register allocation it is preferable
13753 to avoid tying integer scalar and FP scalar modes. Executing integer
13754 operations in general registers is better than treating them as scalar
13755 vector operations. This reduces latency and avoids redundant int<->FP
13756 moves. So tie modes if they are either the same class, or vector modes
13757 with other vector modes, vector structs or any scalar mode. */
13759 static bool
13760 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13762 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13763 return true;
13765 /* We specifically want to allow elements of "structure" modes to
13766 be tieable to the structure. This more general condition allows
13767 other rarer situations too. */
13768 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13769 return true;
13771 /* Also allow any scalar modes with vectors. */
13772 if (aarch64_vector_mode_supported_p (mode1)
13773 || aarch64_vector_mode_supported_p (mode2))
13774 return true;
13776 return false;
13779 /* Return a new RTX holding the result of moving POINTER forward by
13780 AMOUNT bytes. */
13782 static rtx
13783 aarch64_move_pointer (rtx pointer, int amount)
13785 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13787 return adjust_automodify_address (pointer, GET_MODE (pointer),
13788 next, amount);
13791 /* Return a new RTX holding the result of moving POINTER forward by the
13792 size of the mode it points to. */
13794 static rtx
13795 aarch64_progress_pointer (rtx pointer)
13797 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13799 return aarch64_move_pointer (pointer, amount);
13802 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13803 MODE bytes. */
13805 static void
13806 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13807 machine_mode mode)
13809 rtx reg = gen_reg_rtx (mode);
13811 /* "Cast" the pointers to the correct mode. */
13812 *src = adjust_address (*src, mode, 0);
13813 *dst = adjust_address (*dst, mode, 0);
13814 /* Emit the memcpy. */
13815 emit_move_insn (reg, *src);
13816 emit_move_insn (*dst, reg);
13817 /* Move the pointers forward. */
13818 *src = aarch64_progress_pointer (*src);
13819 *dst = aarch64_progress_pointer (*dst);
13822 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13823 we succeed, otherwise return false. */
13825 bool
13826 aarch64_expand_movmem (rtx *operands)
13828 unsigned int n;
13829 rtx dst = operands[0];
13830 rtx src = operands[1];
13831 rtx base;
13832 bool speed_p = !optimize_function_for_size_p (cfun);
13834 /* When optimizing for size, give a better estimate of the length of a
13835 memcpy call, but use the default otherwise. */
13836 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13838 /* We can't do anything smart if the amount to copy is not constant. */
13839 if (!CONST_INT_P (operands[2]))
13840 return false;
13842 n = UINTVAL (operands[2]);
13844 /* Try to keep the number of instructions low. For cases below 16 bytes we
13845 need to make at most two moves. For cases above 16 bytes it will be one
13846 move for each 16 byte chunk, then at most two additional moves. */
13847 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13848 return false;
13850 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13851 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13853 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13854 src = adjust_automodify_address (src, VOIDmode, base, 0);
13856 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13857 1-byte chunk. */
13858 if (n < 4)
13860 if (n >= 2)
13862 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13863 n -= 2;
13866 if (n == 1)
13867 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13869 return true;
13872 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13873 4-byte chunk, partially overlapping with the previously copied chunk. */
13874 if (n < 8)
13876 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13877 n -= 4;
13878 if (n > 0)
13880 int move = n - 4;
13882 src = aarch64_move_pointer (src, move);
13883 dst = aarch64_move_pointer (dst, move);
13884 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13886 return true;
13889 /* Copy 8 bytes or more. Copy chunks of 16 bytes until we run out of
13890 them, then (if applicable) an 8-byte chunk. */
13891 while (n >= 8)
13893 if (n / 16)
13895 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13896 n -= 16;
13898 else
13900 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13901 n -= 8;
13905 /* Finish the final bytes of the copy. We can always do this in one
13906 instruction. We either copy the exact amount we need, or partially
13907 overlap with the previous chunk we copied and copy 8 bytes. */
13908 if (n == 0)
13909 return true;
13910 else if (n == 1)
13911 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13912 else if (n == 2)
13913 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13914 else if (n == 4)
13915 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13916 else
13918 if (n == 3)
13920 src = aarch64_move_pointer (src, -1);
13921 dst = aarch64_move_pointer (dst, -1);
13922 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13924 else
13926 int move = n - 8;
13928 src = aarch64_move_pointer (src, move);
13929 dst = aarch64_move_pointer (dst, move);
13930 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13934 return true;
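/* A hypothetical standalone sketch of the chunking strategy above (the
   helper name is invented for illustration and is not used by the
   compiler): the sequence of block copies emitted for a constant byte
   count N.  Each step is a pair (pointer adjustment, chunk size); a
   negative adjustment means the pointers are moved back so the chunk
   overlaps bytes that were already copied.  For example N = 30 gives
   (0,16), (0,8), (-2,8): 16 + 8 bytes are copied, then the final 8-byte
   chunk re-copies two bytes to cover the remaining six.  Returns the
   number of steps written to ADJUST/SIZE.  */
static int
movmem_chunks_sketch (unsigned int n, int adjust[], int size[])
{
  int steps = 0;

  if (n < 4)
    {
      if (n >= 2)
	{
	  adjust[steps] = 0;
	  size[steps++] = 2;
	  n -= 2;
	}
      if (n == 1)
	{
	  adjust[steps] = 0;
	  size[steps++] = 1;
	}
      return steps;
    }

  if (n < 8)
    {
      adjust[steps] = 0;
      size[steps++] = 4;
      n -= 4;
      if (n > 0)
	{
	  /* Overlapping second 4-byte chunk.  */
	  adjust[steps] = (int) n - 4;
	  size[steps++] = 4;
	}
      return steps;
    }

  while (n >= 8)
    if (n / 16)
      {
	adjust[steps] = 0;
	size[steps++] = 16;
	n -= 16;
      }
    else
      {
	adjust[steps] = 0;
	size[steps++] = 8;
	n -= 8;
      }

  /* Tail of 0-7 bytes, mirroring the final cases above.  */
  if (n == 1 || n == 2 || n == 4)
    {
      adjust[steps] = 0;
      size[steps++] = (int) n;
    }
  else if (n == 3)
    {
      adjust[steps] = -1;
      size[steps++] = 4;
    }
  else if (n != 0)
    {
      adjust[steps] = (int) n - 8;
      size[steps++] = 8;
    }

  return steps;
}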
13937 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13938 SImode stores. Handle the case when the constant has identical
13939 bottom and top halves. This is beneficial when the two stores can be
13940 merged into an STP and we avoid synthesising potentially expensive
13941 immediates twice. Return true if such a split is possible. */
13943 bool
13944 aarch64_split_dimode_const_store (rtx dst, rtx src)
13946 rtx lo = gen_lowpart (SImode, src);
13947 rtx hi = gen_highpart_mode (SImode, DImode, src);
13949 bool size_p = optimize_function_for_size_p (cfun);
13951 if (!rtx_equal_p (lo, hi))
13952 return false;
13954 unsigned int orig_cost
13955 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13956 unsigned int lo_cost
13957 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13959 /* We want to transform:
13960 MOV x1, 49370
13961 MOVK x1, 0x140, lsl 16
13962 MOVK x1, 0xc0da, lsl 32
13963 MOVK x1, 0x140, lsl 48
13964 STR x1, [x0]
13965 into:
13966 MOV w1, 49370
13967 MOVK w1, 0x140, lsl 16
13968 STP w1, w1, [x0]
13969 So we want to perform this only when we save two instructions
13970 or more. When optimizing for size, however, accept any code size
13971 savings we can. */
13972 if (size_p && orig_cost <= lo_cost)
13973 return false;
13975 if (!size_p
13976 && (orig_cost <= lo_cost + 1))
13977 return false;
13979 rtx mem_lo = adjust_address (dst, SImode, 0);
13980 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13981 return false;
13983 rtx tmp_reg = gen_reg_rtx (SImode);
13984 aarch64_expand_mov_immediate (tmp_reg, lo);
13985 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13986 /* Don't emit an explicit store pair as this may not be always profitable.
13987 Let the sched-fusion logic decide whether to merge them. */
13988 emit_move_insn (mem_lo, tmp_reg);
13989 emit_move_insn (mem_hi, tmp_reg);
13991 return true;
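/* A hypothetical standalone sketch (illustration only, not called by the
   compiler): the precondition checked above, i.e. whether a 64-bit
   constant has identical low and high 32-bit halves and is therefore a
   candidate for the MOV/MOVK + STP sequence shown in the comment.  For
   example 0x0140c0da0140c0da qualifies, 0x0000000100000002 does not.  */
static int
dimode_const_has_equal_halves_sketch (unsigned long long val)
{
  unsigned int lo = (unsigned int) (val & 0xffffffffULL);
  unsigned int hi = (unsigned int) (val >> 32);

  return lo == hi;
}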
13994 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13996 static unsigned HOST_WIDE_INT
13997 aarch64_asan_shadow_offset (void)
13999 return (HOST_WIDE_INT_1 << 36);
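/* A hypothetical standalone sketch (illustration only): how the offset
   above enters AddressSanitizer's usual memory-to-shadow mapping, assuming
   the default 8-byte shadow granularity (shadow scale 3).  */
static unsigned long long
asan_shadow_address_sketch (unsigned long long addr)
{
  return (addr >> 3) + (1ULL << 36);
}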
14002 static rtx
14003 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14004 int code, tree treeop0, tree treeop1)
14006 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14007 rtx op0, op1;
14008 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14009 insn_code icode;
14010 struct expand_operand ops[4];
14012 start_sequence ();
14013 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14015 op_mode = GET_MODE (op0);
14016 if (op_mode == VOIDmode)
14017 op_mode = GET_MODE (op1);
14019 switch (op_mode)
14021 case E_QImode:
14022 case E_HImode:
14023 case E_SImode:
14024 cmp_mode = SImode;
14025 icode = CODE_FOR_cmpsi;
14026 break;
14028 case E_DImode:
14029 cmp_mode = DImode;
14030 icode = CODE_FOR_cmpdi;
14031 break;
14033 case E_SFmode:
14034 cmp_mode = SFmode;
14035 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14036 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14037 break;
14039 case E_DFmode:
14040 cmp_mode = DFmode;
14041 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14042 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14043 break;
14045 default:
14046 end_sequence ();
14047 return NULL_RTX;
14050 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14051 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14052 if (!op0 || !op1)
14054 end_sequence ();
14055 return NULL_RTX;
14057 *prep_seq = get_insns ();
14058 end_sequence ();
14060 create_fixed_operand (&ops[0], op0);
14061 create_fixed_operand (&ops[1], op1);
14063 start_sequence ();
14064 if (!maybe_expand_insn (icode, 2, ops))
14066 end_sequence ();
14067 return NULL_RTX;
14069 *gen_seq = get_insns ();
14070 end_sequence ();
14072 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14073 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14076 static rtx
14077 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14078 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14080 rtx op0, op1, target;
14081 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14082 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14083 insn_code icode;
14084 struct expand_operand ops[6];
14085 int aarch64_cond;
14087 push_to_sequence (*prep_seq);
14088 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14090 op_mode = GET_MODE (op0);
14091 if (op_mode == VOIDmode)
14092 op_mode = GET_MODE (op1);
14094 switch (op_mode)
14096 case E_QImode:
14097 case E_HImode:
14098 case E_SImode:
14099 cmp_mode = SImode;
14100 icode = CODE_FOR_ccmpsi;
14101 break;
14103 case E_DImode:
14104 cmp_mode = DImode;
14105 icode = CODE_FOR_ccmpdi;
14106 break;
14108 case E_SFmode:
14109 cmp_mode = SFmode;
14110 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14111 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14112 break;
14114 case E_DFmode:
14115 cmp_mode = DFmode;
14116 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14117 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14118 break;
14120 default:
14121 end_sequence ();
14122 return NULL_RTX;
14125 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14126 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14127 if (!op0 || !op1)
14129 end_sequence ();
14130 return NULL_RTX;
14132 *prep_seq = get_insns ();
14133 end_sequence ();
14135 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14136 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14138 if (bit_code != AND)
14140 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14141 GET_MODE (XEXP (prev, 0))),
14142 VOIDmode, XEXP (prev, 0), const0_rtx);
14143 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14146 create_fixed_operand (&ops[0], XEXP (prev, 0));
14147 create_fixed_operand (&ops[1], target);
14148 create_fixed_operand (&ops[2], op0);
14149 create_fixed_operand (&ops[3], op1);
14150 create_fixed_operand (&ops[4], prev);
14151 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14153 push_to_sequence (*gen_seq);
14154 if (!maybe_expand_insn (icode, 6, ops))
14156 end_sequence ();
14157 return NULL_RTX;
14160 *gen_seq = get_insns ();
14161 end_sequence ();
14163 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14166 #undef TARGET_GEN_CCMP_FIRST
14167 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14169 #undef TARGET_GEN_CCMP_NEXT
14170 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14172 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14173 instruction fusion of some sort. */
14175 static bool
14176 aarch64_macro_fusion_p (void)
14178 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14182 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14183 should be kept together during scheduling. */
14185 static bool
14186 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14188 rtx set_dest;
14189 rtx prev_set = single_set (prev);
14190 rtx curr_set = single_set (curr);
14191 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14192 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14194 if (!aarch64_macro_fusion_p ())
14195 return false;
14197 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14199 /* We are trying to match:
14200 prev (mov) == (set (reg r0) (const_int imm16))
14201 curr (movk) == (set (zero_extract (reg r0)
14202 (const_int 16)
14203 (const_int 16))
14204 (const_int imm16_1)) */
14206 set_dest = SET_DEST (curr_set);
14208 if (GET_CODE (set_dest) == ZERO_EXTRACT
14209 && CONST_INT_P (SET_SRC (curr_set))
14210 && CONST_INT_P (SET_SRC (prev_set))
14211 && CONST_INT_P (XEXP (set_dest, 2))
14212 && INTVAL (XEXP (set_dest, 2)) == 16
14213 && REG_P (XEXP (set_dest, 0))
14214 && REG_P (SET_DEST (prev_set))
14215 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14217 return true;
14221 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14224 /* We're trying to match:
14225 prev (adrp) == (set (reg r1)
14226 (high (symbol_ref ("SYM"))))
14227 curr (add) == (set (reg r0)
14228 (lo_sum (reg r1)
14229 (symbol_ref ("SYM"))))
14230 Note that r0 need not necessarily be the same as r1, especially
14231 during pre-regalloc scheduling. */
14233 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14234 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14236 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14237 && REG_P (XEXP (SET_SRC (curr_set), 0))
14238 && REGNO (XEXP (SET_SRC (curr_set), 0))
14239 == REGNO (SET_DEST (prev_set))
14240 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14241 XEXP (SET_SRC (curr_set), 1)))
14242 return true;
14246 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14249 /* We're trying to match:
14250 prev (movk) == (set (zero_extract (reg r0)
14251 (const_int 16)
14252 (const_int 32))
14253 (const_int imm16_1))
14254 curr (movk) == (set (zero_extract (reg r0)
14255 (const_int 16)
14256 (const_int 48))
14257 (const_int imm16_2)) */
14259 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14260 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14261 && REG_P (XEXP (SET_DEST (prev_set), 0))
14262 && REG_P (XEXP (SET_DEST (curr_set), 0))
14263 && REGNO (XEXP (SET_DEST (prev_set), 0))
14264 == REGNO (XEXP (SET_DEST (curr_set), 0))
14265 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14266 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14267 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14268 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14269 && CONST_INT_P (SET_SRC (prev_set))
14270 && CONST_INT_P (SET_SRC (curr_set)))
14271 return true;
14274 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14276 /* We're trying to match:
14277 prev (adrp) == (set (reg r0)
14278 (high (symbol_ref ("SYM"))))
14279 curr (ldr) == (set (reg r1)
14280 (mem (lo_sum (reg r0)
14281 (symbol_ref ("SYM")))))
14283 curr (ldr) == (set (reg r1)
14284 (zero_extend (mem
14285 (lo_sum (reg r0)
14286 (symbol_ref ("SYM")))))) */
14287 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14288 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14290 rtx curr_src = SET_SRC (curr_set);
14292 if (GET_CODE (curr_src) == ZERO_EXTEND)
14293 curr_src = XEXP (curr_src, 0);
14295 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14296 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14297 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14298 == REGNO (SET_DEST (prev_set))
14299 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14300 XEXP (SET_SRC (prev_set), 0)))
14301 return true;
14305 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14306 && aarch_crypto_can_dual_issue (prev, curr))
14307 return true;
14309 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14310 && any_condjump_p (curr))
14312 enum attr_type prev_type = get_attr_type (prev);
14314 unsigned int condreg1, condreg2;
14315 rtx cc_reg_1;
14316 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14317 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14319 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14320 && prev
14321 && modified_in_p (cc_reg_1, prev))
14323 /* FIXME: this misses some cases that are considered simple arithmetic
14324 instructions for ThunderX. Simple shifts are missed here. */
14325 if (prev_type == TYPE_ALUS_SREG
14326 || prev_type == TYPE_ALUS_IMM
14327 || prev_type == TYPE_LOGICS_REG
14328 || prev_type == TYPE_LOGICS_IMM)
14329 return true;
14333 if (prev_set
14334 && curr_set
14335 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14336 && any_condjump_p (curr))
14338 /* We're trying to match:
14339 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14340 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14341 (const_int 0))
14342 (label_ref ("SYM"))
14343 (pc)) */
14344 if (SET_DEST (curr_set) == (pc_rtx)
14345 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14346 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14347 && REG_P (SET_DEST (prev_set))
14348 && REGNO (SET_DEST (prev_set))
14349 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14351 /* Fuse ALU operations followed by conditional branch instruction. */
14352 switch (get_attr_type (prev))
14354 case TYPE_ALU_IMM:
14355 case TYPE_ALU_SREG:
14356 case TYPE_ADC_REG:
14357 case TYPE_ADC_IMM:
14358 case TYPE_ADCS_REG:
14359 case TYPE_ADCS_IMM:
14360 case TYPE_LOGIC_REG:
14361 case TYPE_LOGIC_IMM:
14362 case TYPE_CSEL:
14363 case TYPE_ADR:
14364 case TYPE_MOV_IMM:
14365 case TYPE_SHIFT_REG:
14366 case TYPE_SHIFT_IMM:
14367 case TYPE_BFM:
14368 case TYPE_RBIT:
14369 case TYPE_REV:
14370 case TYPE_EXTEND:
14371 return true;
14373 default:;
14378 return false;
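/* A hypothetical standalone sketch (illustration only, not used by the
   compiler): the 16-bit pieces of a 32-bit immediate that end up in the
   MOV/MOVK pair matched by the AARCH64_FUSE_MOV_MOVK case above, e.g.
   0x0140c0da -> MOV w0, #0xc0da; MOVK w0, #0x140, lsl 16.  */
static void
mov_movk_pieces_sketch (unsigned int imm, unsigned int *mov_imm16,
			unsigned int *movk_imm16)
{
  *mov_imm16 = imm & 0xffff;		/* Low half, set by the MOV.  */
  *movk_imm16 = (imm >> 16) & 0xffff;	/* High half, inserted by the MOVK.  */
}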
14381 /* Return true iff the instruction fusion described by OP is enabled. */
14383 bool
14384 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14386 return (aarch64_tune_params.fusible_ops & op) != 0;
14389 /* If MEM is in the form of [base+offset], extract the two parts
14390 of the address and store them in BASE and OFFSET; otherwise return
14391 false after clearing BASE and OFFSET. */
14393 bool
14394 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14396 rtx addr;
14398 gcc_assert (MEM_P (mem));
14400 addr = XEXP (mem, 0);
14402 if (REG_P (addr))
14404 *base = addr;
14405 *offset = const0_rtx;
14406 return true;
14409 if (GET_CODE (addr) == PLUS
14410 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14412 *base = XEXP (addr, 0);
14413 *offset = XEXP (addr, 1);
14414 return true;
14417 *base = NULL_RTX;
14418 *offset = NULL_RTX;
14420 return false;
14423 /* Types for scheduling fusion. */
14424 enum sched_fusion_type
14426 SCHED_FUSION_NONE = 0,
14427 SCHED_FUSION_LD_SIGN_EXTEND,
14428 SCHED_FUSION_LD_ZERO_EXTEND,
14429 SCHED_FUSION_LD,
14430 SCHED_FUSION_ST,
14431 SCHED_FUSION_NUM
14434 /* If INSN is a load or store whose address is in the form of [base+offset],
14435 extract the two parts and store them in BASE and OFFSET. Return the
14436 scheduling fusion type of this INSN. */
14438 static enum sched_fusion_type
14439 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14441 rtx x, dest, src;
14442 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14444 gcc_assert (INSN_P (insn));
14445 x = PATTERN (insn);
14446 if (GET_CODE (x) != SET)
14447 return SCHED_FUSION_NONE;
14449 src = SET_SRC (x);
14450 dest = SET_DEST (x);
14452 machine_mode dest_mode = GET_MODE (dest);
14454 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14455 return SCHED_FUSION_NONE;
14457 if (GET_CODE (src) == SIGN_EXTEND)
14459 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14460 src = XEXP (src, 0);
14461 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14462 return SCHED_FUSION_NONE;
14464 else if (GET_CODE (src) == ZERO_EXTEND)
14466 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14467 src = XEXP (src, 0);
14468 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14469 return SCHED_FUSION_NONE;
14472 if (GET_CODE (src) == MEM && REG_P (dest))
14473 extract_base_offset_in_addr (src, base, offset);
14474 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14476 fusion = SCHED_FUSION_ST;
14477 extract_base_offset_in_addr (dest, base, offset);
14479 else
14480 return SCHED_FUSION_NONE;
14482 if (*base == NULL_RTX || *offset == NULL_RTX)
14483 fusion = SCHED_FUSION_NONE;
14485 return fusion;
14488 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14490 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14491 and PRI are only calculated for these instructions. For other instructions,
14492 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14493 types of instruction fusion can be added by returning different priorities.
14495 It's important that irrelevant instructions get the largest FUSION_PRI. */
14497 static void
14498 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14499 int *fusion_pri, int *pri)
14501 int tmp, off_val;
14502 rtx base, offset;
14503 enum sched_fusion_type fusion;
14505 gcc_assert (INSN_P (insn));
14507 tmp = max_pri - 1;
14508 fusion = fusion_load_store (insn, &base, &offset);
14509 if (fusion == SCHED_FUSION_NONE)
14511 *pri = tmp;
14512 *fusion_pri = tmp;
14513 return;
14516 /* Set FUSION_PRI according to fusion type and base register. */
14517 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14519 /* Calculate PRI. */
14520 tmp /= 2;
14522 /* INSN with smaller offset goes first. */
14523 off_val = (int)(INTVAL (offset));
14524 if (off_val >= 0)
14525 tmp -= (off_val & 0xfffff);
14526 else
14527 tmp += ((- off_val) & 0xfffff);
14529 *pri = tmp;
14530 return;
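/* A hypothetical standalone sketch of the priority computation above, with
   NREGS standing in for FIRST_PSEUDO_REGISTER (the helper is invented for
   illustration and is not called by the compiler).  Loads/stores with the
   same fusion type and base register share a FUSION_PRI, and within such a
   group a smaller offset yields a larger PRI, so it is scheduled first and
   consecutive accesses end up adjacent.  */
static void
sched_fusion_priority_sketch (int max_pri, int fusion_type, int base_regno,
			      long long offset, int nregs,
			      int *fusion_pri, int *pri)
{
  int tmp = max_pri - 1;

  *fusion_pri = tmp - fusion_type * nregs - base_regno;

  tmp /= 2;
  if (offset >= 0)
    tmp -= (int) (offset & 0xfffff);
  else
    tmp += (int) ((-offset) & 0xfffff);

  *pri = tmp;
}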
14533 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14534 Adjust priority of sha1h instructions so they are scheduled before
14535 other SHA1 instructions. */
14537 static int
14538 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14540 rtx x = PATTERN (insn);
14542 if (GET_CODE (x) == SET)
14544 x = SET_SRC (x);
14546 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14547 return priority + 10;
14550 return priority;
14553 /* Given OPERANDS of consecutive load/store, check if we can merge
14554 them into ldp/stp. LOAD is true if they are load instructions.
14555 MODE is the mode of memory operands. */
14557 bool
14558 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14559 machine_mode mode)
14561 HOST_WIDE_INT offval_1, offval_2, msize;
14562 enum reg_class rclass_1, rclass_2;
14563 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14565 if (load)
14567 mem_1 = operands[1];
14568 mem_2 = operands[3];
14569 reg_1 = operands[0];
14570 reg_2 = operands[2];
14571 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14572 if (REGNO (reg_1) == REGNO (reg_2))
14573 return false;
14575 else
14577 mem_1 = operands[0];
14578 mem_2 = operands[2];
14579 reg_1 = operands[1];
14580 reg_2 = operands[3];
14583 /* The mems cannot be volatile. */
14584 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14585 return false;
14587 /* If we have SImode and slow unaligned ldp,
14588 check that the alignment is at least 8 bytes. */
14589 if (mode == SImode
14590 && (aarch64_tune_params.extra_tuning_flags
14591 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14592 && !optimize_size
14593 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14594 return false;
14596 /* Check if the addresses are in the form of [base+offset]. */
14597 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14598 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14599 return false;
14600 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14601 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14602 return false;
14604 /* Check if the bases are the same. */
14605 if (!rtx_equal_p (base_1, base_2))
14606 return false;
14608 offval_1 = INTVAL (offset_1);
14609 offval_2 = INTVAL (offset_2);
14610 msize = GET_MODE_SIZE (mode);
14611 /* Check if the offsets are consecutive. */
14612 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14613 return false;
14615 /* Check if the addresses are clobbered by load. */
14616 if (load)
14618 if (reg_mentioned_p (reg_1, mem_1))
14619 return false;
14621 /* In increasing order, the last load can clobber the address. */
14622 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14623 return false;
14626 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14627 rclass_1 = FP_REGS;
14628 else
14629 rclass_1 = GENERAL_REGS;
14631 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14632 rclass_2 = FP_REGS;
14633 else
14634 rclass_2 = GENERAL_REGS;
14636 /* Check if the registers are of the same class. */
14637 if (rclass_1 != rclass_2)
14638 return false;
14640 return true;
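/* A hypothetical standalone sketch (illustration only, not used by the
   compiler): the offset test applied above.  Two accesses of MSIZE bytes
   each can form an ldp/stp candidate only if their offsets differ by
   exactly MSIZE, in either order; e.g. offsets 8 and 12 with msize 4
   qualify, offsets 8 and 16 do not.  */
static int
offsets_consecutive_sketch (long long offval_1, long long offval_2,
			    long long msize)
{
  return offval_1 == offval_2 + msize || offval_2 == offval_1 + msize;
}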
14643 /* Given OPERANDS of consecutive load/store, check if we can merge
14644 them into ldp/stp by adjusting the offset. LOAD is true if they
14645 are load instructions. MODE is the mode of memory operands.
14647 Given below consecutive stores:
14649 str w1, [xb, 0x100]
14650 str w1, [xb, 0x104]
14651 str w1, [xb, 0x108]
14652 str w1, [xb, 0x10c]
14654 Though the offsets are out of the range supported by stp, we can
14655 still pair them after adjusting the offset, like:
14657 add scratch, xb, 0x100
14658 stp w1, w1, [scratch]
14659 stp w1, w1, [scratch, 0x8]
14661 The peephole patterns detecting this opportunity should guarantee
14662 the scratch register is available. */
14664 bool
14665 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14666 scalar_mode mode)
14668 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14669 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14670 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14671 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14673 if (load)
14675 reg_1 = operands[0];
14676 mem_1 = operands[1];
14677 reg_2 = operands[2];
14678 mem_2 = operands[3];
14679 reg_3 = operands[4];
14680 mem_3 = operands[5];
14681 reg_4 = operands[6];
14682 mem_4 = operands[7];
14683 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14684 && REG_P (reg_3) && REG_P (reg_4));
14685 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14686 return false;
14688 else
14690 mem_1 = operands[0];
14691 reg_1 = operands[1];
14692 mem_2 = operands[2];
14693 reg_2 = operands[3];
14694 mem_3 = operands[4];
14695 reg_3 = operands[5];
14696 mem_4 = operands[6];
14697 reg_4 = operands[7];
14699 /* Skip if the memory operand is by itself valid for ldp/stp. */
14700 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14701 return false;
14703 /* The mems cannot be volatile. */
14704 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14705 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4)
14706 return false;
14708 /* Check if the addresses are in the form of [base+offset]. */
14709 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14710 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14711 return false;
14712 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14713 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14714 return false;
14715 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14716 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14717 return false;
14718 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14719 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14720 return false;
14722 /* Check if the bases are the same. */
14723 if (!rtx_equal_p (base_1, base_2)
14724 || !rtx_equal_p (base_2, base_3)
14725 || !rtx_equal_p (base_3, base_4))
14726 return false;
14728 offval_1 = INTVAL (offset_1);
14729 offval_2 = INTVAL (offset_2);
14730 offval_3 = INTVAL (offset_3);
14731 offval_4 = INTVAL (offset_4);
14732 msize = GET_MODE_SIZE (mode);
14733 /* Check if the offsets are consecutive. */
14734 if ((offval_1 != (offval_2 + msize)
14735 || offval_1 != (offval_3 + msize * 2)
14736 || offval_1 != (offval_4 + msize * 3))
14737 && (offval_4 != (offval_3 + msize)
14738 || offval_4 != (offval_2 + msize * 2)
14739 || offval_4 != (offval_1 + msize * 3)))
14740 return false;
14742 /* Check if the addresses are clobbered by load. */
14743 if (load)
14745 if (reg_mentioned_p (reg_1, mem_1)
14746 || reg_mentioned_p (reg_2, mem_2)
14747 || reg_mentioned_p (reg_3, mem_3))
14748 return false;
14750 /* In increasing order, the last load can clobber the address. */
14751 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14752 return false;
14755 /* If we have SImode and slow unaligned ldp,
14756 check that the alignment is at least 8 bytes. */
14757 if (mode == SImode
14758 && (aarch64_tune_params.extra_tuning_flags
14759 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14760 && !optimize_size
14761 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14762 return false;
14764 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14765 rclass_1 = FP_REGS;
14766 else
14767 rclass_1 = GENERAL_REGS;
14769 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14770 rclass_2 = FP_REGS;
14771 else
14772 rclass_2 = GENERAL_REGS;
14774 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14775 rclass_3 = FP_REGS;
14776 else
14777 rclass_3 = GENERAL_REGS;
14779 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14780 rclass_4 = FP_REGS;
14781 else
14782 rclass_4 = GENERAL_REGS;
14784 /* Check if the registers are of the same class. */
14785 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14786 return false;
14788 return true;
14791 /* Given OPERANDS of consecutive load/store, this function pairs them
14792 into ldp/stp after adjusting the offset. It depends on the fact
14793 that addresses of load/store instructions are in increasing order.
14794 MODE is the mode of the memory operands. CODE is the rtl operator
14795 which should be applied to all memory operands; it is SIGN_EXTEND,
14796 ZERO_EXTEND or UNKNOWN. */
14798 bool
14799 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14800 scalar_mode mode, RTX_CODE code)
14802 rtx base, offset, t1, t2;
14803 rtx mem_1, mem_2, mem_3, mem_4;
14804 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14806 if (load)
14808 mem_1 = operands[1];
14809 mem_2 = operands[3];
14810 mem_3 = operands[5];
14811 mem_4 = operands[7];
14813 else
14815 mem_1 = operands[0];
14816 mem_2 = operands[2];
14817 mem_3 = operands[4];
14818 mem_4 = operands[6];
14819 gcc_assert (code == UNKNOWN);
14822 extract_base_offset_in_addr (mem_1, &base, &offset);
14823 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14825 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14826 msize = GET_MODE_SIZE (mode);
14827 stp_off_limit = msize * 0x40;
14828 off_val = INTVAL (offset);
14829 abs_off = (off_val < 0) ? -off_val : off_val;
14830 new_off = abs_off % stp_off_limit;
14831 adj_off = abs_off - new_off;
14833 /* Further adjust to make sure all offsets are OK. */
14834 if ((new_off + msize * 2) >= stp_off_limit)
14836 adj_off += stp_off_limit;
14837 new_off -= stp_off_limit;
14840 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14841 if (adj_off >= 0x1000)
14842 return false;
14844 if (off_val < 0)
14846 adj_off = -adj_off;
14847 new_off = -new_off;
14850 /* Create new memory references. */
14851 mem_1 = change_address (mem_1, VOIDmode,
14852 plus_constant (DImode, operands[8], new_off));
14854 /* Check if the adjusted address is OK for ldp/stp. */
14855 if (!aarch64_mem_pair_operand (mem_1, mode))
14856 return false;
14858 msize = GET_MODE_SIZE (mode);
14859 mem_2 = change_address (mem_2, VOIDmode,
14860 plus_constant (DImode,
14861 operands[8],
14862 new_off + msize));
14863 mem_3 = change_address (mem_3, VOIDmode,
14864 plus_constant (DImode,
14865 operands[8],
14866 new_off + msize * 2));
14867 mem_4 = change_address (mem_4, VOIDmode,
14868 plus_constant (DImode,
14869 operands[8],
14870 new_off + msize * 3));
14872 if (code == ZERO_EXTEND)
14874 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14875 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14876 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14877 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14879 else if (code == SIGN_EXTEND)
14881 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14882 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14883 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14884 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14887 if (load)
14889 operands[1] = mem_1;
14890 operands[3] = mem_2;
14891 operands[5] = mem_3;
14892 operands[7] = mem_4;
14894 else
14896 operands[0] = mem_1;
14897 operands[2] = mem_2;
14898 operands[4] = mem_3;
14899 operands[6] = mem_4;
14902 /* Emit adjusting instruction. */
14903 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14904 /* Emit ldp/stp instructions. */
14905 t1 = gen_rtx_SET (operands[0], operands[1]);
14906 t2 = gen_rtx_SET (operands[2], operands[3]);
14907 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14908 t1 = gen_rtx_SET (operands[4], operands[5]);
14909 t2 = gen_rtx_SET (operands[6], operands[7]);
14910 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14911 return true;
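/* A hypothetical standalone sketch of the offset split performed above
   (the helper name is invented for illustration and is not called by the
   compiler).  For the first access's offset OFF_VAL and element size MSIZE
   it computes the base adjustment ADJ_OFF (applied to the base register
   with a single ADD/SUB) and the residual offset NEW_OFF used in the
   ldp/stp.  With OFF_VAL = 0x100 and MSIZE = 4 this gives ADJ_OFF = 0x100
   and NEW_OFF = 0, matching the add/stp example in the comment before
   aarch64_operands_adjust_ok_for_ldpstp.  Returns 0 if the adjustment
   would not fit in an ADD/SUB immediate.  */
static int
adjusted_ldpstp_offsets_sketch (long long off_val, long long msize,
				long long *adj_off, long long *new_off)
{
  long long stp_off_limit = msize * 0x40;
  long long abs_off = off_val < 0 ? -off_val : off_val;

  *new_off = abs_off % stp_off_limit;
  *adj_off = abs_off - *new_off;

  /* Mirror the further adjustment above so the later offsets stay
     in range.  */
  if (*new_off + msize * 2 >= stp_off_limit)
    {
      *adj_off += stp_off_limit;
      *new_off -= stp_off_limit;
    }

  /* The adjustment must be encodable as an ADD/SUB immediate.  */
  if (*adj_off >= 0x1000)
    return 0;

  if (off_val < 0)
    {
      *adj_off = -*adj_off;
      *new_off = -*new_off;
    }

  return 1;
}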
14914 /* Return true if a pseudo register should be created and used to hold
14915 the GOT address for PIC code. */
14917 bool
14918 aarch64_use_pseudo_pic_reg (void)
14920 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14923 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14925 static int
14926 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14928 switch (XINT (x, 1))
14930 case UNSPEC_GOTSMALLPIC:
14931 case UNSPEC_GOTSMALLPIC28K:
14932 case UNSPEC_GOTTINYPIC:
14933 return 0;
14934 default:
14935 break;
14938 return default_unspec_may_trap_p (x, flags);
14942 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14943 return the log2 of that value. Otherwise return -1. */
14946 aarch64_fpconst_pow_of_2 (rtx x)
14948 const REAL_VALUE_TYPE *r;
14950 if (!CONST_DOUBLE_P (x))
14951 return -1;
14953 r = CONST_DOUBLE_REAL_VALUE (x);
14955 if (REAL_VALUE_NEGATIVE (*r)
14956 || REAL_VALUE_ISNAN (*r)
14957 || REAL_VALUE_ISINF (*r)
14958 || !real_isinteger (r, DFmode))
14959 return -1;
14961 return exact_log2 (real_to_integer (r));
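/* A hypothetical standalone sketch of the same test on a plain double,
   without going through the RTL constant (illustration only, not used by
   the compiler).  Repeated halving is exact in binary floating point, so a
   positive value reaches exactly 1.0 if and only if it is a power of two;
   values below 1.0 are rejected, mirroring the integer requirement above.
   E.g. 8.0 gives 3, while 6.0 and 0.5 give -1.  */
static int
fpconst_pow_of_2_sketch (double d)
{
  int log2val = 0;

  /* Reject values below 1.0 as well as infinities and NaNs.  */
  if (!(d >= 1.0) || d * 0.5 == d)
    return -1;

  while (d > 1.0)
    {
      d /= 2.0;
      log2val++;
    }

  return d == 1.0 ? log2val : -1;
}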
14964 /* If X is a vector of equal CONST_DOUBLE values and that value is
14965 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14968 aarch64_vec_fpconst_pow_of_2 (rtx x)
14970 if (GET_CODE (x) != CONST_VECTOR)
14971 return -1;
14973 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14974 return -1;
14976 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14977 if (firstval <= 0)
14978 return -1;
14980 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14981 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14982 return -1;
14984 return firstval;
14987 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14988 to float.
14990 __fp16 always promotes through this hook.
14991 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14992 through the generic excess precision logic rather than here. */
14994 static tree
14995 aarch64_promoted_type (const_tree t)
14997 if (SCALAR_FLOAT_TYPE_P (t)
14998 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14999 return float_type_node;
15001 return NULL_TREE;
15004 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15006 static bool
15007 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15008 optimization_type opt_type)
15010 switch (op)
15012 case rsqrt_optab:
15013 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15015 default:
15016 return true;
15020 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15021 if MODE is HFmode, and punt to the generic implementation otherwise. */
15023 static bool
15024 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15026 return (mode == HFmode
15027 ? true
15028 : default_libgcc_floating_mode_supported_p (mode));
15031 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15032 if MODE is HFmode, and punt to the generic implementation otherwise. */
15034 static bool
15035 aarch64_scalar_mode_supported_p (scalar_mode mode)
15037 return (mode == HFmode
15038 ? true
15039 : default_scalar_mode_supported_p (mode));
15042 /* Set the value of FLT_EVAL_METHOD.
15043 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15045 0: evaluate all operations and constants, whose semantic type has at
15046 most the range and precision of type float, to the range and
15047 precision of float; evaluate all other operations and constants to
15048 the range and precision of the semantic type;
15050 N, where _FloatN is a supported interchange floating type
15051 evaluate all operations and constants, whose semantic type has at
15052 most the range and precision of _FloatN type, to the range and
15053 precision of the _FloatN type; evaluate all other operations and
15054 constants to the range and precision of the semantic type;
15056 If we have the ARMv8.2-A extensions then we support _Float16 in native
15057 precision, so we should set this to 16. Otherwise, we support the type,
15058 but want to evaluate expressions in float precision, so set this to
15059 0. */
15061 static enum flt_eval_method
15062 aarch64_excess_precision (enum excess_precision_type type)
15064 switch (type)
15066 case EXCESS_PRECISION_TYPE_FAST:
15067 case EXCESS_PRECISION_TYPE_STANDARD:
15068 /* We can calculate either in 16-bit range and precision or
15069 32-bit range and precision. Make that decision based on whether
15070 we have native support for the ARMv8.2-A 16-bit floating-point
15071 instructions or not. */
15072 return (TARGET_FP_F16INST
15073 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15074 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15075 case EXCESS_PRECISION_TYPE_IMPLICIT:
15076 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15077 default:
15078 gcc_unreachable ();
15080 return FLT_EVAL_METHOD_UNPREDICTABLE;
15083 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15084 scheduled for speculative execution. Reject the long-running division
15085 and square-root instructions. */
15087 static bool
15088 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15090 switch (get_attr_type (insn))
15092 case TYPE_SDIV:
15093 case TYPE_UDIV:
15094 case TYPE_FDIVS:
15095 case TYPE_FDIVD:
15096 case TYPE_FSQRTS:
15097 case TYPE_FSQRTD:
15098 case TYPE_NEON_FP_SQRT_S:
15099 case TYPE_NEON_FP_SQRT_D:
15100 case TYPE_NEON_FP_SQRT_S_Q:
15101 case TYPE_NEON_FP_SQRT_D_Q:
15102 case TYPE_NEON_FP_DIV_S:
15103 case TYPE_NEON_FP_DIV_D:
15104 case TYPE_NEON_FP_DIV_S_Q:
15105 case TYPE_NEON_FP_DIV_D_Q:
15106 return false;
15107 default:
15108 return true;
15112 /* Target-specific selftests. */
15114 #if CHECKING_P
15116 namespace selftest {
15118 /* Selftest for the RTL loader.
15119 Verify that the RTL loader copes with a dump from
15120 print_rtx_function. This is essentially just a test that class
15121 function_reader can handle a real dump, but it also verifies
15122 that lookup_reg_by_dump_name correctly handles hard regs.
15123 The presence of hard reg names in the dump means that the test is
15124 target-specific, hence it is in this file. */
15126 static void
15127 aarch64_test_loading_full_dump ()
15129 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15131 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15133 rtx_insn *insn_1 = get_insn_by_uid (1);
15134 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15136 rtx_insn *insn_15 = get_insn_by_uid (15);
15137 ASSERT_EQ (INSN, GET_CODE (insn_15));
15138 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15140 /* Verify crtl->return_rtx. */
15141 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15142 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15143 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15146 /* Run all target-specific selftests. */
15148 static void
15149 aarch64_run_selftests (void)
15151 aarch64_test_loading_full_dump ();
15154 } // namespace selftest
15156 #endif /* #if CHECKING_P */
15158 #undef TARGET_ADDRESS_COST
15159 #define TARGET_ADDRESS_COST aarch64_address_cost
15161 /* This hook determines whether unnamed bitfields affect the alignment
15162 of the containing structure. The hook returns true if the structure
15163 should inherit the alignment requirements of an unnamed bitfield's
15164 type. */
15165 #undef TARGET_ALIGN_ANON_BITFIELD
15166 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15168 #undef TARGET_ASM_ALIGNED_DI_OP
15169 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15171 #undef TARGET_ASM_ALIGNED_HI_OP
15172 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15174 #undef TARGET_ASM_ALIGNED_SI_OP
15175 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15177 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15178 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15179 hook_bool_const_tree_hwi_hwi_const_tree_true
15181 #undef TARGET_ASM_FILE_START
15182 #define TARGET_ASM_FILE_START aarch64_start_file
15184 #undef TARGET_ASM_OUTPUT_MI_THUNK
15185 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15187 #undef TARGET_ASM_SELECT_RTX_SECTION
15188 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15190 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15191 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15193 #undef TARGET_BUILD_BUILTIN_VA_LIST
15194 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15196 #undef TARGET_CALLEE_COPIES
15197 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15199 #undef TARGET_CAN_ELIMINATE
15200 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15202 #undef TARGET_CAN_INLINE_P
15203 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15205 #undef TARGET_CANNOT_FORCE_CONST_MEM
15206 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15208 #undef TARGET_CASE_VALUES_THRESHOLD
15209 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15211 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15212 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15214 /* Only the least significant bit is used for initialization guard
15215 variables. */
15216 #undef TARGET_CXX_GUARD_MASK_BIT
15217 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15219 #undef TARGET_C_MODE_FOR_SUFFIX
15220 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15222 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15223 #undef TARGET_DEFAULT_TARGET_FLAGS
15224 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15225 #endif
15227 #undef TARGET_CLASS_MAX_NREGS
15228 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15230 #undef TARGET_BUILTIN_DECL
15231 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15233 #undef TARGET_BUILTIN_RECIPROCAL
15234 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15236 #undef TARGET_C_EXCESS_PRECISION
15237 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15239 #undef TARGET_EXPAND_BUILTIN
15240 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15242 #undef TARGET_EXPAND_BUILTIN_VA_START
15243 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15245 #undef TARGET_FOLD_BUILTIN
15246 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15248 #undef TARGET_FUNCTION_ARG
15249 #define TARGET_FUNCTION_ARG aarch64_function_arg
15251 #undef TARGET_FUNCTION_ARG_ADVANCE
15252 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15254 #undef TARGET_FUNCTION_ARG_BOUNDARY
15255 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15257 #undef TARGET_FUNCTION_ARG_PADDING
15258 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15260 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15261 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15263 #undef TARGET_FUNCTION_VALUE
15264 #define TARGET_FUNCTION_VALUE aarch64_function_value
15266 #undef TARGET_FUNCTION_VALUE_REGNO_P
15267 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15269 #undef TARGET_GIMPLE_FOLD_BUILTIN
15270 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15272 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15273 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15275 #undef TARGET_INIT_BUILTINS
15276 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15278 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15279 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15280 aarch64_ira_change_pseudo_allocno_class
15282 #undef TARGET_LEGITIMATE_ADDRESS_P
15283 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15285 #undef TARGET_LEGITIMATE_CONSTANT_P
15286 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15288 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15289 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15290 aarch64_legitimize_address_displacement
15292 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15293 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15295 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15296 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15297 aarch64_libgcc_floating_mode_supported_p
15299 #undef TARGET_MANGLE_TYPE
15300 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15302 #undef TARGET_MEMORY_MOVE_COST
15303 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15305 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15306 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15308 #undef TARGET_MUST_PASS_IN_STACK
15309 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15311 /* This target hook should return true if accesses to volatile bitfields
15312 should use the narrowest mode possible. It should return false if these
15313 accesses should use the bitfield container type. */
15314 #undef TARGET_NARROW_VOLATILE_BITFIELD
15315 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15317 #undef TARGET_OPTION_OVERRIDE
15318 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15320 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15321 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15322 aarch64_override_options_after_change
15324 #undef TARGET_OPTION_SAVE
15325 #define TARGET_OPTION_SAVE aarch64_option_save
15327 #undef TARGET_OPTION_RESTORE
15328 #define TARGET_OPTION_RESTORE aarch64_option_restore
15330 #undef TARGET_OPTION_PRINT
15331 #define TARGET_OPTION_PRINT aarch64_option_print
15333 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15334 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15336 #undef TARGET_SET_CURRENT_FUNCTION
15337 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15339 #undef TARGET_PASS_BY_REFERENCE
15340 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15342 #undef TARGET_PREFERRED_RELOAD_CLASS
15343 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15345 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15346 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15348 #undef TARGET_PROMOTED_TYPE
15349 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15351 #undef TARGET_SECONDARY_RELOAD
15352 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15354 #undef TARGET_SHIFT_TRUNCATION_MASK
15355 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15357 #undef TARGET_SETUP_INCOMING_VARARGS
15358 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15360 #undef TARGET_STRUCT_VALUE_RTX
15361 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15363 #undef TARGET_REGISTER_MOVE_COST
15364 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15366 #undef TARGET_RETURN_IN_MEMORY
15367 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15369 #undef TARGET_RETURN_IN_MSB
15370 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15372 #undef TARGET_RTX_COSTS
15373 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15375 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15376 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15378 #undef TARGET_SCHED_ISSUE_RATE
15379 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15381 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15382 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15383 aarch64_sched_first_cycle_multipass_dfa_lookahead
15385 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15386 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15387 aarch64_first_cycle_multipass_dfa_lookahead_guard
15389 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15390 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15391 aarch64_get_separate_components
15393 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15394 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15395 aarch64_components_for_bb
15397 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15398 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15399 aarch64_disqualify_components
15401 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15402 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15403 aarch64_emit_prologue_components
15405 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15406 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15407 aarch64_emit_epilogue_components
15409 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15410 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15411 aarch64_set_handled_components
15413 #undef TARGET_TRAMPOLINE_INIT
15414 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15416 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15417 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15419 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15420 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15422 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15423 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15424 aarch64_builtin_support_vector_misalignment
15426 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15427 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15429 #undef TARGET_VECTORIZE_ADD_STMT_COST
15430 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15432 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15433 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15434 aarch64_builtin_vectorization_cost
15436 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15437 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15439 #undef TARGET_VECTORIZE_BUILTINS
15440 #define TARGET_VECTORIZE_BUILTINS
15442 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15443 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15444 aarch64_builtin_vectorized_function
15446 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15447 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15448 aarch64_autovectorize_vector_sizes
15450 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15451 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15452 aarch64_atomic_assign_expand_fenv
15454 /* Section anchor support. */
15456 #undef TARGET_MIN_ANCHOR_OFFSET
15457 #define TARGET_MIN_ANCHOR_OFFSET -256
15459 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15460 byte offset; we can do much more for larger data types, but have no way
15461 to determine the size of the access. We assume accesses are aligned. */
15462 #undef TARGET_MAX_ANCHOR_OFFSET
15463 #define TARGET_MAX_ANCHOR_OFFSET 4095
15465 #undef TARGET_VECTOR_ALIGNMENT
15466 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15468 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15469 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15470 aarch64_simd_vector_alignment_reachable
15472 /* vec_perm support. */
15474 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15475 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15476 aarch64_vectorize_vec_perm_const_ok
15478 #undef TARGET_INIT_LIBFUNCS
15479 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15481 #undef TARGET_FIXED_CONDITION_CODE_REGS
15482 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15484 #undef TARGET_FLAGS_REGNUM
15485 #define TARGET_FLAGS_REGNUM CC_REGNUM
15487 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15488 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15490 #undef TARGET_ASAN_SHADOW_OFFSET
15491 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15493 #undef TARGET_LEGITIMIZE_ADDRESS
15494 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15496 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15497 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15499 #undef TARGET_CAN_USE_DOLOOP_P
15500 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15502 #undef TARGET_SCHED_ADJUST_PRIORITY
15503 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15505 #undef TARGET_SCHED_MACRO_FUSION_P
15506 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15508 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15509 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15511 #undef TARGET_SCHED_FUSION_PRIORITY
15512 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15514 #undef TARGET_UNSPEC_MAY_TRAP_P
15515 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15517 #undef TARGET_USE_PSEUDO_PIC_REG
15518 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15520 #undef TARGET_PRINT_OPERAND
15521 #define TARGET_PRINT_OPERAND aarch64_print_operand
15523 #undef TARGET_PRINT_OPERAND_ADDRESS
15524 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15526 #undef TARGET_OPTAB_SUPPORTED_P
15527 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15529 #undef TARGET_OMIT_STRUCT_RETURN_REG
15530 #define TARGET_OMIT_STRUCT_RETURN_REG true
15532 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15533 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15534 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15536 #undef TARGET_HARD_REGNO_NREGS
15537 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15538 #undef TARGET_HARD_REGNO_MODE_OK
15539 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15541 #undef TARGET_MODES_TIEABLE_P
15542 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15544 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15545 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15546 aarch64_hard_regno_call_part_clobbered
15548 #undef TARGET_CONSTANT_ALIGNMENT
15549 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15551 #if CHECKING_P
15552 #undef TARGET_RUN_TARGET_SELFTESTS
15553 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15554 #endif /* #if CHECKING_P */
15556 struct gcc_target targetm = TARGET_INITIALIZER;
15558 #include "gt-aarch64.h"