gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
 145                                                  vec_perm_indices);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement TARGET_HARD_REGNO_NREGS.  */
1071
1072 static unsigned int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1087
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return true;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110       else
1111         return true;
1112     }
1113
1114   return false;
1115 }
1116
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1118    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1119    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1120
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1123 {
1124   return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1125 }
1126
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130                                      machine_mode mode)
1131 {
1132   /* Handle modes that fit within single registers.  */
1133   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1134     {
1135       if (GET_MODE_SIZE (mode) >= 4)
1136         return mode;
1137       else
1138         return SImode;
1139     }
1140   /* Fall back to generic for multi-reg and very large modes.  */
1141   else
1142     return choose_hard_reg_mode (regno, nregs, false);
1143 }
1144
1145 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1146    that strcpy from constants will be faster.  */
1147
1148 static HOST_WIDE_INT
1149 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1150 {
1151   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1152     return MAX (align, BITS_PER_WORD);
1153   return align;
1154 }
1155
1156 /* Return true if calls to DECL should be treated as
1157    long-calls (ie called via a register).  */
1158 static bool
1159 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1160 {
1161   return false;
1162 }
1163
1164 /* Return true if calls to symbol-ref SYM should be treated as
1165    long-calls (ie called via a register).  */
1166 bool
1167 aarch64_is_long_call_p (rtx sym)
1168 {
1169   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1170 }
1171
1172 /* Return true if calls to symbol-ref SYM should not go through
1173    plt stubs.  */
1174
1175 bool
1176 aarch64_is_noplt_call_p (rtx sym)
1177 {
1178   const_tree decl = SYMBOL_REF_DECL (sym);
1179
1180   if (flag_pic
1181       && decl
1182       && (!flag_plt
1183           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1184       && !targetm.binds_local_p (decl))
1185     return true;
1186
1187   return false;
1188 }
1189
1190 /* Return true if the offsets to a zero/sign-extract operation
1191    represent an expression that matches an extend operation.  The
1192    operands represent the paramters from
1193
1194    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1195 bool
1196 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1197                                 rtx extract_imm)
1198 {
1199   HOST_WIDE_INT mult_val, extract_val;
1200
1201   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1202     return false;
1203
1204   mult_val = INTVAL (mult_imm);
1205   extract_val = INTVAL (extract_imm);
1206
1207   if (extract_val > 8
1208       && extract_val < GET_MODE_BITSIZE (mode)
1209       && exact_log2 (extract_val & ~7) > 0
1210       && (extract_val & 7) <= 4
1211       && mult_val == (1 << (extract_val & 7)))
1212     return true;
1213
1214   return false;
1215 }
1216
1217 /* Emit an insn that's a simple single-set.  Both the operands must be
1218    known to be valid.  */
1219 inline static rtx_insn *
1220 emit_set_insn (rtx x, rtx y)
1221 {
1222   return emit_insn (gen_rtx_SET (x, y));
1223 }
1224
1225 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1226    return the rtx for register 0 in the proper mode.  */
1227 rtx
1228 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1229 {
1230   machine_mode mode = SELECT_CC_MODE (code, x, y);
1231   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1232
1233   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1234   return cc_reg;
1235 }
1236
1237 /* Build the SYMBOL_REF for __tls_get_addr.  */
1238
1239 static GTY(()) rtx tls_get_addr_libfunc;
1240
1241 rtx
1242 aarch64_tls_get_addr (void)
1243 {
1244   if (!tls_get_addr_libfunc)
1245     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1246   return tls_get_addr_libfunc;
1247 }
1248
1249 /* Return the TLS model to use for ADDR.  */
1250
1251 static enum tls_model
1252 tls_symbolic_operand_type (rtx addr)
1253 {
1254   enum tls_model tls_kind = TLS_MODEL_NONE;
1255   rtx sym, addend;
1256
1257   if (GET_CODE (addr) == CONST)
1258     {
1259       split_const (addr, &sym, &addend);
1260       if (GET_CODE (sym) == SYMBOL_REF)
1261         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1262     }
1263   else if (GET_CODE (addr) == SYMBOL_REF)
1264     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1265
1266   return tls_kind;
1267 }
1268
1269 /* We'll allow lo_sum's in addresses in our legitimate addresses
1270    so that combine would take care of combining addresses where
1271    necessary, but for generation purposes, we'll generate the address
1272    as :
1273    RTL                               Absolute
1274    tmp = hi (symbol_ref);            adrp  x1, foo
1275    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1276                                      nop
1277
1278    PIC                               TLS
1279    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1280    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1281                                      bl   __tls_get_addr
1282                                      nop
1283
1284    Load TLS symbol, depending on TLS mechanism and TLS access model.
1285
1286    Global Dynamic - Traditional TLS:
1287    adrp tmp, :tlsgd:imm
1288    add  dest, tmp, #:tlsgd_lo12:imm
1289    bl   __tls_get_addr
1290
1291    Global Dynamic - TLS Descriptors:
1292    adrp dest, :tlsdesc:imm
1293    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1294    add  dest, dest, #:tlsdesc_lo12:imm
1295    blr  tmp
1296    mrs  tp, tpidr_el0
1297    add  dest, dest, tp
1298
1299    Initial Exec:
1300    mrs  tp, tpidr_el0
1301    adrp tmp, :gottprel:imm
1302    ldr  dest, [tmp, #:gottprel_lo12:imm]
1303    add  dest, dest, tp
1304
1305    Local Exec:
1306    mrs  tp, tpidr_el0
1307    add  t0, tp, #:tprel_hi12:imm, lsl #12
1308    add  t0, t0, #:tprel_lo12_nc:imm
1309 */
1310
1311 static void
1312 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1313                                    enum aarch64_symbol_type type)
1314 {
1315   switch (type)
1316     {
1317     case SYMBOL_SMALL_ABSOLUTE:
1318       {
1319         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1320         rtx tmp_reg = dest;
1321         machine_mode mode = GET_MODE (dest);
1322
1323         gcc_assert (mode == Pmode || mode == ptr_mode);
1324
1325         if (can_create_pseudo_p ())
1326           tmp_reg = gen_reg_rtx (mode);
1327
1328         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1329         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1330         return;
1331       }
1332
1333     case SYMBOL_TINY_ABSOLUTE:
1334       emit_insn (gen_rtx_SET (dest, imm));
1335       return;
1336
1337     case SYMBOL_SMALL_GOT_28K:
1338       {
1339         machine_mode mode = GET_MODE (dest);
1340         rtx gp_rtx = pic_offset_table_rtx;
1341         rtx insn;
1342         rtx mem;
1343
1344         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1345            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1346            decide rtx costs, in which case pic_offset_table_rtx is not
1347            initialized.  For that case no need to generate the first adrp
1348            instruction as the final cost for global variable access is
1349            one instruction.  */
1350         if (gp_rtx != NULL)
1351           {
1352             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1353                using the page base as GOT base, the first page may be wasted,
1354                in the worst scenario, there is only 28K space for GOT).
1355
1356                The generate instruction sequence for accessing global variable
1357                is:
1358
1359                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1360
1361                Only one instruction needed. But we must initialize
1362                pic_offset_table_rtx properly.  We generate initialize insn for
1363                every global access, and allow CSE to remove all redundant.
1364
1365                The final instruction sequences will look like the following
1366                for multiply global variables access.
1367
1368                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1369
1370                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1371                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1372                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1373                  ...  */
1374
1375             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1376             crtl->uses_pic_offset_table = 1;
1377             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1378
1379             if (mode != GET_MODE (gp_rtx))
1380              gp_rtx = gen_lowpart (mode, gp_rtx);
1381
1382           }
1383
1384         if (mode == ptr_mode)
1385           {
1386             if (mode == DImode)
1387               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1388             else
1389               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1390
1391             mem = XVECEXP (SET_SRC (insn), 0, 0);
1392           }
1393         else
1394           {
1395             gcc_assert (mode == Pmode);
1396
1397             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1398             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1399           }
1400
1401         /* The operand is expected to be MEM.  Whenever the related insn
1402            pattern changed, above code which calculate mem should be
1403            updated.  */
1404         gcc_assert (GET_CODE (mem) == MEM);
1405         MEM_READONLY_P (mem) = 1;
1406         MEM_NOTRAP_P (mem) = 1;
1407         emit_insn (insn);
1408         return;
1409       }
1410
1411     case SYMBOL_SMALL_GOT_4G:
1412       {
1413         /* In ILP32, the mode of dest can be either SImode or DImode,
1414            while the got entry is always of SImode size.  The mode of
1415            dest depends on how dest is used: if dest is assigned to a
1416            pointer (e.g. in the memory), it has SImode; it may have
1417            DImode if dest is dereferenced to access the memeory.
1418            This is why we have to handle three different ldr_got_small
1419            patterns here (two patterns for ILP32).  */
1420
1421         rtx insn;
1422         rtx mem;
1423         rtx tmp_reg = dest;
1424         machine_mode mode = GET_MODE (dest);
1425
1426         if (can_create_pseudo_p ())
1427           tmp_reg = gen_reg_rtx (mode);
1428
1429         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1430         if (mode == ptr_mode)
1431           {
1432             if (mode == DImode)
1433               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1434             else
1435               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1436
1437             mem = XVECEXP (SET_SRC (insn), 0, 0);
1438           }
1439         else
1440           {
1441             gcc_assert (mode == Pmode);
1442
1443             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1444             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1445           }
1446
1447         gcc_assert (GET_CODE (mem) == MEM);
1448         MEM_READONLY_P (mem) = 1;
1449         MEM_NOTRAP_P (mem) = 1;
1450         emit_insn (insn);
1451         return;
1452       }
1453
1454     case SYMBOL_SMALL_TLSGD:
1455       {
1456         rtx_insn *insns;
1457         machine_mode mode = GET_MODE (dest);
1458         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1459
1460         start_sequence ();
1461         if (TARGET_ILP32)
1462           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1463         else
1464           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1465         insns = get_insns ();
1466         end_sequence ();
1467
1468         RTL_CONST_CALL_P (insns) = 1;
1469         emit_libcall_block (insns, dest, result, imm);
1470         return;
1471       }
1472
1473     case SYMBOL_SMALL_TLSDESC:
1474       {
1475         machine_mode mode = GET_MODE (dest);
1476         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1477         rtx tp;
1478
1479         gcc_assert (mode == Pmode || mode == ptr_mode);
1480
1481         /* In ILP32, the got entry is always of SImode size.  Unlike
1482            small GOT, the dest is fixed at reg 0.  */
1483         if (TARGET_ILP32)
1484           emit_insn (gen_tlsdesc_small_si (imm));
1485         else
1486           emit_insn (gen_tlsdesc_small_di (imm));
1487         tp = aarch64_load_tp (NULL);
1488
1489         if (mode != Pmode)
1490           tp = gen_lowpart (mode, tp);
1491
1492         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1493         if (REG_P (dest))
1494           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1495         return;
1496       }
1497
1498     case SYMBOL_SMALL_TLSIE:
1499       {
1500         /* In ILP32, the mode of dest can be either SImode or DImode,
1501            while the got entry is always of SImode size.  The mode of
1502            dest depends on how dest is used: if dest is assigned to a
1503            pointer (e.g. in the memory), it has SImode; it may have
1504            DImode if dest is dereferenced to access the memeory.
1505            This is why we have to handle three different tlsie_small
1506            patterns here (two patterns for ILP32).  */
1507         machine_mode mode = GET_MODE (dest);
1508         rtx tmp_reg = gen_reg_rtx (mode);
1509         rtx tp = aarch64_load_tp (NULL);
1510
1511         if (mode == ptr_mode)
1512           {
1513             if (mode == DImode)
1514               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1515             else
1516               {
1517                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1518                 tp = gen_lowpart (mode, tp);
1519               }
1520           }
1521         else
1522           {
1523             gcc_assert (mode == Pmode);
1524             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1525           }
1526
1527         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1528         if (REG_P (dest))
1529           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1530         return;
1531       }
1532
1533     case SYMBOL_TLSLE12:
1534     case SYMBOL_TLSLE24:
1535     case SYMBOL_TLSLE32:
1536     case SYMBOL_TLSLE48:
1537       {
1538         machine_mode mode = GET_MODE (dest);
1539         rtx tp = aarch64_load_tp (NULL);
1540
1541         if (mode != Pmode)
1542           tp = gen_lowpart (mode, tp);
1543
1544         switch (type)
1545           {
1546           case SYMBOL_TLSLE12:
1547             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1548                         (dest, tp, imm));
1549             break;
1550           case SYMBOL_TLSLE24:
1551             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1552                         (dest, tp, imm));
1553           break;
1554           case SYMBOL_TLSLE32:
1555             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1556                         (dest, imm));
1557             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1558                         (dest, dest, tp));
1559           break;
1560           case SYMBOL_TLSLE48:
1561             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1562                         (dest, imm));
1563             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1564                         (dest, dest, tp));
1565             break;
1566           default:
1567             gcc_unreachable ();
1568           }
1569
1570         if (REG_P (dest))
1571           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1572         return;
1573       }
1574
1575     case SYMBOL_TINY_GOT:
1576       emit_insn (gen_ldr_got_tiny (dest, imm));
1577       return;
1578
1579     case SYMBOL_TINY_TLSIE:
1580       {
1581         machine_mode mode = GET_MODE (dest);
1582         rtx tp = aarch64_load_tp (NULL);
1583
1584         if (mode == ptr_mode)
1585           {
1586             if (mode == DImode)
1587               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1588             else
1589               {
1590                 tp = gen_lowpart (mode, tp);
1591                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1592               }
1593           }
1594         else
1595           {
1596             gcc_assert (mode == Pmode);
1597             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1598           }
1599
1600         if (REG_P (dest))
1601           set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1602         return;
1603       }
1604
1605     default:
1606       gcc_unreachable ();
1607     }
1608 }
1609
1610 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1611    handle all moves if !can_create_pseudo_p ().  The distinction is
1612    important because, unlike emit_move_insn, the move expanders know
1613    how to force Pmode objects into the constant pool even when the
1614    constant pool address is not itself legitimate.  */
1615 static rtx
1616 aarch64_emit_move (rtx dest, rtx src)
1617 {
1618   return (can_create_pseudo_p ()
1619           ? emit_move_insn (dest, src)
1620           : emit_move_insn_1 (dest, src));
1621 }
1622
1623 /* Split a 128-bit move operation into two 64-bit move operations,
1624    taking care to handle partial overlap of register to register
1625    copies.  Special cases are needed when moving between GP regs and
1626    FP regs.  SRC can be a register, constant or memory; DST a register
1627    or memory.  If either operand is memory it must not have any side
1628    effects.  */
1629 void
1630 aarch64_split_128bit_move (rtx dst, rtx src)
1631 {
1632   rtx dst_lo, dst_hi;
1633   rtx src_lo, src_hi;
1634
1635   machine_mode mode = GET_MODE (dst);
1636
1637   gcc_assert (mode == TImode || mode == TFmode);
1638   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1639   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1640
1641   if (REG_P (dst) && REG_P (src))
1642     {
1643       int src_regno = REGNO (src);
1644       int dst_regno = REGNO (dst);
1645
1646       /* Handle FP <-> GP regs.  */
1647       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1648         {
1649           src_lo = gen_lowpart (word_mode, src);
1650           src_hi = gen_highpart (word_mode, src);
1651
1652           if (mode == TImode)
1653             {
1654               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1655               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1656             }
1657           else
1658             {
1659               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1660               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1661             }
1662           return;
1663         }
1664       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1665         {
1666           dst_lo = gen_lowpart (word_mode, dst);
1667           dst_hi = gen_highpart (word_mode, dst);
1668
1669           if (mode == TImode)
1670             {
1671               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1672               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1673             }
1674           else
1675             {
1676               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1677               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1678             }
1679           return;
1680         }
1681     }
1682
1683   dst_lo = gen_lowpart (word_mode, dst);
1684   dst_hi = gen_highpart (word_mode, dst);
1685   src_lo = gen_lowpart (word_mode, src);
1686   src_hi = gen_highpart_mode (word_mode, mode, src);
1687
1688   /* At most one pairing may overlap.  */
1689   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1690     {
1691       aarch64_emit_move (dst_hi, src_hi);
1692       aarch64_emit_move (dst_lo, src_lo);
1693     }
1694   else
1695     {
1696       aarch64_emit_move (dst_lo, src_lo);
1697       aarch64_emit_move (dst_hi, src_hi);
1698     }
1699 }
1700
1701 bool
1702 aarch64_split_128bit_move_p (rtx dst, rtx src)
1703 {
1704   return (! REG_P (src)
1705           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1706 }
1707
1708 /* Split a complex SIMD combine.  */
1709
1710 void
1711 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1712 {
1713   machine_mode src_mode = GET_MODE (src1);
1714   machine_mode dst_mode = GET_MODE (dst);
1715
1716   gcc_assert (VECTOR_MODE_P (dst_mode));
1717   gcc_assert (register_operand (dst, dst_mode)
1718               && register_operand (src1, src_mode)
1719               && register_operand (src2, src_mode));
1720
1721   rtx (*gen) (rtx, rtx, rtx);
1722
1723   switch (src_mode)
1724     {
1725     case E_V8QImode:
1726       gen = gen_aarch64_simd_combinev8qi;
1727       break;
1728     case E_V4HImode:
1729       gen = gen_aarch64_simd_combinev4hi;
1730       break;
1731     case E_V2SImode:
1732       gen = gen_aarch64_simd_combinev2si;
1733       break;
1734     case E_V4HFmode:
1735       gen = gen_aarch64_simd_combinev4hf;
1736       break;
1737     case E_V2SFmode:
1738       gen = gen_aarch64_simd_combinev2sf;
1739       break;
1740     case E_DImode:
1741       gen = gen_aarch64_simd_combinedi;
1742       break;
1743     case E_DFmode:
1744       gen = gen_aarch64_simd_combinedf;
1745       break;
1746     default:
1747       gcc_unreachable ();
1748     }
1749
1750   emit_insn (gen (dst, src1, src2));
1751   return;
1752 }
1753
1754 /* Split a complex SIMD move.  */
1755
1756 void
1757 aarch64_split_simd_move (rtx dst, rtx src)
1758 {
1759   machine_mode src_mode = GET_MODE (src);
1760   machine_mode dst_mode = GET_MODE (dst);
1761
1762   gcc_assert (VECTOR_MODE_P (dst_mode));
1763
1764   if (REG_P (dst) && REG_P (src))
1765     {
1766       rtx (*gen) (rtx, rtx);
1767
1768       gcc_assert (VECTOR_MODE_P (src_mode));
1769
1770       switch (src_mode)
1771         {
1772         case E_V16QImode:
1773           gen = gen_aarch64_split_simd_movv16qi;
1774           break;
1775         case E_V8HImode:
1776           gen = gen_aarch64_split_simd_movv8hi;
1777           break;
1778         case E_V4SImode:
1779           gen = gen_aarch64_split_simd_movv4si;
1780           break;
1781         case E_V2DImode:
1782           gen = gen_aarch64_split_simd_movv2di;
1783           break;
1784         case E_V8HFmode:
1785           gen = gen_aarch64_split_simd_movv8hf;
1786           break;
1787         case E_V4SFmode:
1788           gen = gen_aarch64_split_simd_movv4sf;
1789           break;
1790         case E_V2DFmode:
1791           gen = gen_aarch64_split_simd_movv2df;
1792           break;
1793         default:
1794           gcc_unreachable ();
1795         }
1796
1797       emit_insn (gen (dst, src));
1798       return;
1799     }
1800 }
1801
1802 bool
1803 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1804                               machine_mode ymode, rtx y)
1805 {
1806   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1807   gcc_assert (r != NULL);
1808   return rtx_equal_p (x, r);
1809 }
1810
1811
1812 static rtx
1813 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1814 {
1815   if (can_create_pseudo_p ())
1816     return force_reg (mode, value);
1817   else
1818     {
1819       x = aarch64_emit_move (x, value);
1820       return x;
1821     }
1822 }
1823
1824
1825 static rtx
1826 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1827                     HOST_WIDE_INT offset)
1828 {
1829   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1830     {
1831       rtx high;
1832       /* Load the full offset into a register.  This
1833          might be improvable in the future.  */
1834       high = GEN_INT (offset);
1835       offset = 0;
1836       high = aarch64_force_temporary (mode, temp, high);
1837       reg = aarch64_force_temporary (mode, temp,
1838                                      gen_rtx_PLUS (mode, high, reg));
1839     }
1840   return plus_constant (mode, reg, offset);
1841 }
1842
1843 static int
1844 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1845                                 scalar_int_mode mode)
1846 {
1847   int i;
1848   unsigned HOST_WIDE_INT val, val2, mask;
1849   int one_match, zero_match;
1850   int num_insns;
1851
1852   val = INTVAL (imm);
1853
1854   if (aarch64_move_imm (val, mode))
1855     {
1856       if (generate)
1857         emit_insn (gen_rtx_SET (dest, imm));
1858       return 1;
1859     }
1860
1861   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1862      (with XXXX non-zero). In that case check to see if the move can be done in
1863      a smaller mode.  */
1864   val2 = val & 0xffffffff;
1865   if (mode == DImode
1866       && aarch64_move_imm (val2, SImode)
1867       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1868     {
1869       if (generate)
1870         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1871
1872       /* Check if we have to emit a second instruction by checking to see
1873          if any of the upper 32 bits of the original DI mode value is set.  */
1874       if (val == val2)
1875         return 1;
1876
1877       i = (val >> 48) ? 48 : 32;
1878
1879       if (generate)
1880          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1881                                     GEN_INT ((val >> i) & 0xffff)));
1882
1883       return 2;
1884     }
1885
1886   if ((val >> 32) == 0 || mode == SImode)
1887     {
1888       if (generate)
1889         {
1890           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1891           if (mode == SImode)
1892             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1893                                        GEN_INT ((val >> 16) & 0xffff)));
1894           else
1895             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1896                                        GEN_INT ((val >> 16) & 0xffff)));
1897         }
1898       return 2;
1899     }
1900
1901   /* Remaining cases are all for DImode.  */
1902
1903   mask = 0xffff;
1904   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1905     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1906   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1907     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1908
1909   if (zero_match != 2 && one_match != 2)
1910     {
1911       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1912          For a 64-bit bitmask try whether changing 16 bits to all ones or
1913          zeroes creates a valid bitmask.  To check any repeated bitmask,
1914          try using 16 bits from the other 32-bit half of val.  */
1915
1916       for (i = 0; i < 64; i += 16, mask <<= 16)
1917         {
1918           val2 = val & ~mask;
1919           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1920             break;
1921           val2 = val | mask;
1922           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1923             break;
1924           val2 = val2 & ~mask;
1925           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1926           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1927             break;
1928         }
1929       if (i != 64)
1930         {
1931           if (generate)
1932             {
1933               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1934               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1935                                          GEN_INT ((val >> i) & 0xffff)));
1936             }
1937           return 2;
1938         }
1939     }
1940
1941   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1942      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1943      otherwise skip zero bits.  */
1944
1945   num_insns = 1;
1946   mask = 0xffff;
1947   val2 = one_match > zero_match ? ~val : val;
1948   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1949
1950   if (generate)
1951     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1952                                            ? (val | ~(mask << i))
1953                                            : (val & (mask << i)))));
1954   for (i += 16; i < 64; i += 16)
1955     {
1956       if ((val2 & (mask << i)) == 0)
1957         continue;
1958       if (generate)
1959         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1960                                    GEN_INT ((val >> i) & 0xffff)));
1961       num_insns ++;
1962     }
1963
1964   return num_insns;
1965 }
1966
1967
1968 void
1969 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1970 {
1971   machine_mode mode = GET_MODE (dest);
1972
1973   gcc_assert (mode == SImode || mode == DImode);
1974
1975   /* Check on what type of symbol it is.  */
1976   scalar_int_mode int_mode;
1977   if ((GET_CODE (imm) == SYMBOL_REF
1978        || GET_CODE (imm) == LABEL_REF
1979        || GET_CODE (imm) == CONST)
1980       && is_a <scalar_int_mode> (mode, &int_mode))
1981     {
1982       rtx mem, base, offset;
1983       enum aarch64_symbol_type sty;
1984
1985       /* If we have (const (plus symbol offset)), separate out the offset
1986          before we start classifying the symbol.  */
1987       split_const (imm, &base, &offset);
1988
1989       sty = aarch64_classify_symbol (base, offset);
1990       switch (sty)
1991         {
1992         case SYMBOL_FORCE_TO_MEM:
1993           if (offset != const0_rtx
1994               && targetm.cannot_force_const_mem (int_mode, imm))
1995             {
1996               gcc_assert (can_create_pseudo_p ());
1997               base = aarch64_force_temporary (int_mode, dest, base);
1998               base = aarch64_add_offset (int_mode, NULL, base,
1999                                          INTVAL (offset));
2000               aarch64_emit_move (dest, base);
2001               return;
2002             }
2003
2004           mem = force_const_mem (ptr_mode, imm);
2005           gcc_assert (mem);
2006
2007           /* If we aren't generating PC relative literals, then
2008              we need to expand the literal pool access carefully.
2009              This is something that needs to be done in a number
2010              of places, so could well live as a separate function.  */
2011           if (!aarch64_pcrelative_literal_loads)
2012             {
2013               gcc_assert (can_create_pseudo_p ());
2014               base = gen_reg_rtx (ptr_mode);
2015               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2016               if (ptr_mode != Pmode)
2017                 base = convert_memory_address (Pmode, base);
2018               mem = gen_rtx_MEM (ptr_mode, base);
2019             }
2020
2021           if (int_mode != ptr_mode)
2022             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2023
2024           emit_insn (gen_rtx_SET (dest, mem));
2025
2026           return;
2027
2028         case SYMBOL_SMALL_TLSGD:
2029         case SYMBOL_SMALL_TLSDESC:
2030         case SYMBOL_SMALL_TLSIE:
2031         case SYMBOL_SMALL_GOT_28K:
2032         case SYMBOL_SMALL_GOT_4G:
2033         case SYMBOL_TINY_GOT:
2034         case SYMBOL_TINY_TLSIE:
2035           if (offset != const0_rtx)
2036             {
2037               gcc_assert(can_create_pseudo_p ());
2038               base = aarch64_force_temporary (int_mode, dest, base);
2039               base = aarch64_add_offset (int_mode, NULL, base,
2040                                          INTVAL (offset));
2041               aarch64_emit_move (dest, base);
2042               return;
2043             }
2044           /* FALLTHRU */
2045
2046         case SYMBOL_SMALL_ABSOLUTE:
2047         case SYMBOL_TINY_ABSOLUTE:
2048         case SYMBOL_TLSLE12:
2049         case SYMBOL_TLSLE24:
2050         case SYMBOL_TLSLE32:
2051         case SYMBOL_TLSLE48:
2052           aarch64_load_symref_appropriately (dest, imm, sty);
2053           return;
2054
2055         default:
2056           gcc_unreachable ();
2057         }
2058     }
2059
2060   if (!CONST_INT_P (imm))
2061     {
2062       if (GET_CODE (imm) == HIGH)
2063         emit_insn (gen_rtx_SET (dest, imm));
2064       else
2065         {
2066           rtx mem = force_const_mem (mode, imm);
2067           gcc_assert (mem);
2068           emit_insn (gen_rtx_SET (dest, mem));
2069         }
2070
2071       return;
2072     }
2073
2074   aarch64_internal_mov_immediate (dest, imm, true,
2075                                   as_a <scalar_int_mode> (mode));
2076 }
2077
2078 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2079    temporary value if necessary.  FRAME_RELATED_P should be true if
2080    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2081    to the generated instructions.  If SCRATCHREG is known to hold
2082    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2083    immediate again.
2084
2085    Since this function may be used to adjust the stack pointer, we must
2086    ensure that it cannot cause transient stack deallocation (for example
2087    by first incrementing SP and then decrementing when adjusting by a
2088    large immediate).  */
2089
2090 static void
2091 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2092                                int scratchreg, HOST_WIDE_INT delta,
2093                                bool frame_related_p, bool emit_move_imm)
2094 {
2095   HOST_WIDE_INT mdelta = abs_hwi (delta);
2096   rtx this_rtx = gen_rtx_REG (mode, regnum);
2097   rtx_insn *insn;
2098
2099   if (!mdelta)
2100     return;
2101
2102   /* Single instruction adjustment.  */
2103   if (aarch64_uimm12_shift (mdelta))
2104     {
2105       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2106       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2107       return;
2108     }
2109
2110   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2111      Only do this if mdelta is not a 16-bit move as adjusting using a move
2112      is better.  */
2113   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2114     {
2115       HOST_WIDE_INT low_off = mdelta & 0xfff;
2116
2117       low_off = delta < 0 ? -low_off : low_off;
2118       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2119       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2120       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2121       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2122       return;
2123     }
2124
2125   /* Emit a move immediate if required and an addition/subtraction.  */
2126   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2127   if (emit_move_imm)
2128     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2129   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2130                               : gen_add2_insn (this_rtx, scratch_rtx));
2131   if (frame_related_p)
2132     {
2133       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2134       rtx adj = plus_constant (mode, this_rtx, delta);
2135       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2136     }
2137 }
2138
2139 static inline void
2140 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2141                       HOST_WIDE_INT delta)
2142 {
2143   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2144 }
2145
2146 static inline void
2147 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2148 {
2149   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2150                                  true, emit_move_imm);
2151 }
2152
2153 static inline void
2154 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2155 {
2156   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2157                                  frame_related_p, true);
2158 }
2159
2160 static bool
2161 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2162                                  tree exp ATTRIBUTE_UNUSED)
2163 {
2164   /* Currently, always true.  */
2165   return true;
2166 }
2167
2168 /* Implement TARGET_PASS_BY_REFERENCE.  */
2169
2170 static bool
2171 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2172                            machine_mode mode,
2173                            const_tree type,
2174                            bool named ATTRIBUTE_UNUSED)
2175 {
2176   HOST_WIDE_INT size;
2177   machine_mode dummymode;
2178   int nregs;
2179
2180   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2181   size = (mode == BLKmode && type)
2182     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2183
2184   /* Aggregates are passed by reference based on their size.  */
2185   if (type && AGGREGATE_TYPE_P (type))
2186     {
2187       size = int_size_in_bytes (type);
2188     }
2189
2190   /* Variable sized arguments are always returned by reference.  */
2191   if (size < 0)
2192     return true;
2193
2194   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2195   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2196                                                &dummymode, &nregs,
2197                                                NULL))
2198     return false;
2199
2200   /* Arguments which are variable sized or larger than 2 registers are
2201      passed by reference unless they are a homogenous floating point
2202      aggregate.  */
2203   return size > 2 * UNITS_PER_WORD;
2204 }
2205
2206 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2207 static bool
2208 aarch64_return_in_msb (const_tree valtype)
2209 {
2210   machine_mode dummy_mode;
2211   int dummy_int;
2212
2213   /* Never happens in little-endian mode.  */
2214   if (!BYTES_BIG_ENDIAN)
2215     return false;
2216
2217   /* Only composite types smaller than or equal to 16 bytes can
2218      be potentially returned in registers.  */
2219   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2220       || int_size_in_bytes (valtype) <= 0
2221       || int_size_in_bytes (valtype) > 16)
2222     return false;
2223
2224   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2225      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2226      is always passed/returned in the least significant bits of fp/simd
2227      register(s).  */
2228   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2229                                                &dummy_mode, &dummy_int, NULL))
2230     return false;
2231
2232   return true;
2233 }
2234
2235 /* Implement TARGET_FUNCTION_VALUE.
2236    Define how to find the value returned by a function.  */
2237
2238 static rtx
2239 aarch64_function_value (const_tree type, const_tree func,
2240                         bool outgoing ATTRIBUTE_UNUSED)
2241 {
2242   machine_mode mode;
2243   int unsignedp;
2244   int count;
2245   machine_mode ag_mode;
2246
2247   mode = TYPE_MODE (type);
2248   if (INTEGRAL_TYPE_P (type))
2249     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2250
2251   if (aarch64_return_in_msb (type))
2252     {
2253       HOST_WIDE_INT size = int_size_in_bytes (type);
2254
2255       if (size % UNITS_PER_WORD != 0)
2256         {
2257           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2258           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2259         }
2260     }
2261
2262   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2263                                                &ag_mode, &count, NULL))
2264     {
2265       if (!aarch64_composite_type_p (type, mode))
2266         {
2267           gcc_assert (count == 1 && mode == ag_mode);
2268           return gen_rtx_REG (mode, V0_REGNUM);
2269         }
2270       else
2271         {
2272           int i;
2273           rtx par;
2274
2275           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2276           for (i = 0; i < count; i++)
2277             {
2278               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2279               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2280                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2281               XVECEXP (par, 0, i) = tmp;
2282             }
2283           return par;
2284         }
2285     }
2286   else
2287     return gen_rtx_REG (mode, R0_REGNUM);
2288 }
2289
2290 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2291    Return true if REGNO is the number of a hard register in which the values
2292    of called function may come back.  */
2293
2294 static bool
2295 aarch64_function_value_regno_p (const unsigned int regno)
2296 {
2297   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2298      of 16-byte return values are: 128-bit integers and 16-byte small
2299      structures (excluding homogeneous floating-point aggregates).  */
2300   if (regno == R0_REGNUM || regno == R1_REGNUM)
2301     return true;
2302
2303   /* Up to four fp/simd registers can return a function value, e.g. a
2304      homogeneous floating-point aggregate having four members.  */
2305   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2306     return TARGET_FLOAT;
2307
2308   return false;
2309 }
2310
2311 /* Implement TARGET_RETURN_IN_MEMORY.
2312
2313    If the type T of the result of a function is such that
2314      void func (T arg)
2315    would require that arg be passed as a value in a register (or set of
2316    registers) according to the parameter passing rules, then the result
2317    is returned in the same registers as would be used for such an
2318    argument.  */
2319
2320 static bool
2321 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2322 {
2323   HOST_WIDE_INT size;
2324   machine_mode ag_mode;
2325   int count;
2326
2327   if (!AGGREGATE_TYPE_P (type)
2328       && TREE_CODE (type) != COMPLEX_TYPE
2329       && TREE_CODE (type) != VECTOR_TYPE)
2330     /* Simple scalar types always returned in registers.  */
2331     return false;
2332
2333   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2334                                                type,
2335                                                &ag_mode,
2336                                                &count,
2337                                                NULL))
2338     return false;
2339
2340   /* Types larger than 2 registers returned in memory.  */
2341   size = int_size_in_bytes (type);
2342   return (size < 0 || size > 2 * UNITS_PER_WORD);
2343 }
2344
2345 static bool
2346 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2347                                const_tree type, int *nregs)
2348 {
2349   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2350   return aarch64_vfp_is_call_or_return_candidate (mode,
2351                                                   type,
2352                                                   &pcum->aapcs_vfp_rmode,
2353                                                   nregs,
2354                                                   NULL);
2355 }
2356
2357 /* Given MODE and TYPE of a function argument, return the alignment in
2358    bits.  The idea is to suppress any stronger alignment requested by
2359    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2360    This is a helper function for local use only.  */
2361
2362 static unsigned int
2363 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2364 {
2365   if (!type)
2366     return GET_MODE_ALIGNMENT (mode);
2367
2368   if (integer_zerop (TYPE_SIZE (type)))
2369     return 0;
2370
2371   gcc_assert (TYPE_MODE (type) == mode);
2372
2373   if (!AGGREGATE_TYPE_P (type))
2374     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2375
2376   if (TREE_CODE (type) == ARRAY_TYPE)
2377     return TYPE_ALIGN (TREE_TYPE (type));
2378
2379   unsigned int alignment = 0;
2380   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2381     if (TREE_CODE (field) == FIELD_DECL)
2382       alignment = std::max (alignment, DECL_ALIGN (field));
2383
2384   return alignment;
2385 }
2386
2387 /* Layout a function argument according to the AAPCS64 rules.  The rule
2388    numbers refer to the rule numbers in the AAPCS64.  */
2389
2390 static void
2391 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2392                     const_tree type,
2393                     bool named ATTRIBUTE_UNUSED)
2394 {
2395   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2396   int ncrn, nvrn, nregs;
2397   bool allocate_ncrn, allocate_nvrn;
2398   HOST_WIDE_INT size;
2399
2400   /* We need to do this once per argument.  */
2401   if (pcum->aapcs_arg_processed)
2402     return;
2403
2404   pcum->aapcs_arg_processed = true;
2405
2406   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2407   size
2408     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2409                 UNITS_PER_WORD);
2410
2411   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2412   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2413                                                  mode,
2414                                                  type,
2415                                                  &nregs);
2416
2417   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2418      The following code thus handles passing by SIMD/FP registers first.  */
2419
2420   nvrn = pcum->aapcs_nvrn;
2421
2422   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2423      and homogenous short-vector aggregates (HVA).  */
2424   if (allocate_nvrn)
2425     {
2426       if (!TARGET_FLOAT)
2427         aarch64_err_no_fpadvsimd (mode, "argument");
2428
2429       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2430         {
2431           pcum->aapcs_nextnvrn = nvrn + nregs;
2432           if (!aarch64_composite_type_p (type, mode))
2433             {
2434               gcc_assert (nregs == 1);
2435               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2436             }
2437           else
2438             {
2439               rtx par;
2440               int i;
2441               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2442               for (i = 0; i < nregs; i++)
2443                 {
2444                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2445                                          V0_REGNUM + nvrn + i);
2446                   tmp = gen_rtx_EXPR_LIST
2447                     (VOIDmode, tmp,
2448                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2449                   XVECEXP (par, 0, i) = tmp;
2450                 }
2451               pcum->aapcs_reg = par;
2452             }
2453           return;
2454         }
2455       else
2456         {
2457           /* C.3 NSRN is set to 8.  */
2458           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2459           goto on_stack;
2460         }
2461     }
2462
2463   ncrn = pcum->aapcs_ncrn;
2464   nregs = size / UNITS_PER_WORD;
2465
2466   /* C6 - C9.  though the sign and zero extension semantics are
2467      handled elsewhere.  This is the case where the argument fits
2468      entirely general registers.  */
2469   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2470     {
2471
2472       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2473
2474       /* C.8 if the argument has an alignment of 16 then the NGRN is
2475          rounded up to the next even number.  */
2476       if (nregs == 2
2477           && ncrn % 2
2478           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2479              comparison is there because for > 16 * BITS_PER_UNIT
2480              alignment nregs should be > 2 and therefore it should be
2481              passed by reference rather than value.  */
2482           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2483         {
2484           ++ncrn;
2485           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2486         }
2487
2488       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2489          A reg is still generated for it, but the caller should be smart
2490          enough not to use it.  */
2491       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2492         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2493       else
2494         {
2495           rtx par;
2496           int i;
2497
2498           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2499           for (i = 0; i < nregs; i++)
2500             {
2501               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2502               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2503                                        GEN_INT (i * UNITS_PER_WORD));
2504               XVECEXP (par, 0, i) = tmp;
2505             }
2506           pcum->aapcs_reg = par;
2507         }
2508
2509       pcum->aapcs_nextncrn = ncrn + nregs;
2510       return;
2511     }
2512
2513   /* C.11  */
2514   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2515
2516   /* The argument is passed on stack; record the needed number of words for
2517      this argument and align the total size if necessary.  */
2518 on_stack:
2519   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2520
2521   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2522     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2523                                        16 / UNITS_PER_WORD);
2524   return;
2525 }
2526
2527 /* Implement TARGET_FUNCTION_ARG.  */
2528
2529 static rtx
2530 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2531                       const_tree type, bool named)
2532 {
2533   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2534   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2535
2536   if (mode == VOIDmode)
2537     return NULL_RTX;
2538
2539   aarch64_layout_arg (pcum_v, mode, type, named);
2540   return pcum->aapcs_reg;
2541 }
2542
2543 void
2544 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2545                            const_tree fntype ATTRIBUTE_UNUSED,
2546                            rtx libname ATTRIBUTE_UNUSED,
2547                            const_tree fndecl ATTRIBUTE_UNUSED,
2548                            unsigned n_named ATTRIBUTE_UNUSED)
2549 {
2550   pcum->aapcs_ncrn = 0;
2551   pcum->aapcs_nvrn = 0;
2552   pcum->aapcs_nextncrn = 0;
2553   pcum->aapcs_nextnvrn = 0;
2554   pcum->pcs_variant = ARM_PCS_AAPCS64;
2555   pcum->aapcs_reg = NULL_RTX;
2556   pcum->aapcs_arg_processed = false;
2557   pcum->aapcs_stack_words = 0;
2558   pcum->aapcs_stack_size = 0;
2559
2560   if (!TARGET_FLOAT
2561       && fndecl && TREE_PUBLIC (fndecl)
2562       && fntype && fntype != error_mark_node)
2563     {
2564       const_tree type = TREE_TYPE (fntype);
2565       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2566       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2567       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2568                                                    &mode, &nregs, NULL))
2569         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2570     }
2571   return;
2572 }
2573
2574 static void
2575 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2576                               machine_mode mode,
2577                               const_tree type,
2578                               bool named)
2579 {
2580   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2581   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2582     {
2583       aarch64_layout_arg (pcum_v, mode, type, named);
2584       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2585                   != (pcum->aapcs_stack_words != 0));
2586       pcum->aapcs_arg_processed = false;
2587       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2588       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2589       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2590       pcum->aapcs_stack_words = 0;
2591       pcum->aapcs_reg = NULL_RTX;
2592     }
2593 }
2594
2595 bool
2596 aarch64_function_arg_regno_p (unsigned regno)
2597 {
2598   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2599           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2600 }
2601
2602 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2603    PARM_BOUNDARY bits of alignment, but will be given anything up
2604    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2605    that both before and after the layout of each argument, the Next
2606    Stacked Argument Address (NSAA) will have a minimum alignment of
2607    8 bytes.  */
2608
2609 static unsigned int
2610 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2611 {
2612   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2613   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2614 }
2615
2616 /* Implement TARGET_FUNCTION_ARG_PADDING.
2617
2618    Small aggregate types are placed in the lowest memory address.
2619
2620    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2621
2622 static pad_direction
2623 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2624 {
2625   /* On little-endian targets, the least significant byte of every stack
2626      argument is passed at the lowest byte address of the stack slot.  */
2627   if (!BYTES_BIG_ENDIAN)
2628     return PAD_UPWARD;
2629
2630   /* Otherwise, integral, floating-point and pointer types are padded downward:
2631      the least significant byte of a stack argument is passed at the highest
2632      byte address of the stack slot.  */
2633   if (type
2634       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2635          || POINTER_TYPE_P (type))
2636       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2637     return PAD_DOWNWARD;
2638
2639   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2640   return PAD_UPWARD;
2641 }
2642
2643 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2644
2645    It specifies padding for the last (may also be the only)
2646    element of a block move between registers and memory.  If
2647    assuming the block is in the memory, padding upward means that
2648    the last element is padded after its highest significant byte,
2649    while in downward padding, the last element is padded at the
2650    its least significant byte side.
2651
2652    Small aggregates and small complex types are always padded
2653    upwards.
2654
2655    We don't need to worry about homogeneous floating-point or
2656    short-vector aggregates; their move is not affected by the
2657    padding direction determined here.  Regardless of endianness,
2658    each element of such an aggregate is put in the least
2659    significant bits of a fp/simd register.
2660
2661    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2662    register has useful data, and return the opposite if the most
2663    significant byte does.  */
2664
2665 bool
2666 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2667                      bool first ATTRIBUTE_UNUSED)
2668 {
2669
2670   /* Small composite types are always padded upward.  */
2671   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2672     {
2673       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2674                             : GET_MODE_SIZE (mode));
2675       if (size < 2 * UNITS_PER_WORD)
2676         return true;
2677     }
2678
2679   /* Otherwise, use the default padding.  */
2680   return !BYTES_BIG_ENDIAN;
2681 }
2682
2683 static scalar_int_mode
2684 aarch64_libgcc_cmp_return_mode (void)
2685 {
2686   return SImode;
2687 }
2688
2689 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2690
2691 /* We use the 12-bit shifted immediate arithmetic instructions so values
2692    must be multiple of (1 << 12), i.e. 4096.  */
2693 #define ARITH_FACTOR 4096
2694
2695 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2696 #error Cannot use simple address calculation for stack probing
2697 #endif
2698
2699 /* The pair of scratch registers used for stack probing.  */
2700 #define PROBE_STACK_FIRST_REG  9
2701 #define PROBE_STACK_SECOND_REG 10
2702
2703 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2704    inclusive.  These are offsets from the current stack pointer.  */
2705
2706 static void
2707 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2708 {
2709   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2710
2711   /* See the same assertion on PROBE_INTERVAL above.  */
2712   gcc_assert ((first % ARITH_FACTOR) == 0);
2713
2714   /* See if we have a constant small number of probes to generate.  If so,
2715      that's the easy case.  */
2716   if (size <= PROBE_INTERVAL)
2717     {
2718       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2719
2720       emit_set_insn (reg1,
2721                      plus_constant (Pmode,
2722                                     stack_pointer_rtx, -(first + base)));
2723       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2724     }
2725
2726   /* The run-time loop is made up of 8 insns in the generic case while the
2727      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2728   else if (size <= 4 * PROBE_INTERVAL)
2729     {
2730       HOST_WIDE_INT i, rem;
2731
2732       emit_set_insn (reg1,
2733                      plus_constant (Pmode,
2734                                     stack_pointer_rtx,
2735                                     -(first + PROBE_INTERVAL)));
2736       emit_stack_probe (reg1);
2737
2738       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2739          it exceeds SIZE.  If only two probes are needed, this will not
2740          generate any code.  Then probe at FIRST + SIZE.  */
2741       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2742         {
2743           emit_set_insn (reg1,
2744                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2745           emit_stack_probe (reg1);
2746         }
2747
2748       rem = size - (i - PROBE_INTERVAL);
2749       if (rem > 256)
2750         {
2751           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2752
2753           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2754           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2755         }
2756       else
2757         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2758     }
2759
2760   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2761      extra careful with variables wrapping around because we might be at
2762      the very top (or the very bottom) of the address space and we have
2763      to be able to handle this case properly; in particular, we use an
2764      equality test for the loop condition.  */
2765   else
2766     {
2767       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2768
2769       /* Step 1: round SIZE to the previous multiple of the interval.  */
2770
2771       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2772
2773
2774       /* Step 2: compute initial and final value of the loop counter.  */
2775
2776       /* TEST_ADDR = SP + FIRST.  */
2777       emit_set_insn (reg1,
2778                      plus_constant (Pmode, stack_pointer_rtx, -first));
2779
2780       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2781       HOST_WIDE_INT adjustment = - (first + rounded_size);
2782       if (! aarch64_uimm12_shift (adjustment))
2783         {
2784           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2785                                           true, Pmode);
2786           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2787         }
2788       else
2789         {
2790           emit_set_insn (reg2,
2791                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2792         }
2793
2794       /* Step 3: the loop
2795
2796          do
2797            {
2798              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2799              probe at TEST_ADDR
2800            }
2801          while (TEST_ADDR != LAST_ADDR)
2802
2803          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2804          until it is equal to ROUNDED_SIZE.  */
2805
2806       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2807
2808
2809       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2810          that SIZE is equal to ROUNDED_SIZE.  */
2811
2812       if (size != rounded_size)
2813         {
2814           HOST_WIDE_INT rem = size - rounded_size;
2815
2816           if (rem > 256)
2817             {
2818               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2819
2820               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2821               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2822             }
2823           else
2824             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2825         }
2826     }
2827
2828   /* Make sure nothing is scheduled before we are done.  */
2829   emit_insn (gen_blockage ());
2830 }
2831
2832 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2833    absolute addresses.  */
2834
2835 const char *
2836 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2837 {
2838   static int labelno = 0;
2839   char loop_lab[32];
2840   rtx xops[2];
2841
2842   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2843
2844   /* Loop.  */
2845   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2846
2847   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2848   xops[0] = reg1;
2849   xops[1] = GEN_INT (PROBE_INTERVAL);
2850   output_asm_insn ("sub\t%0, %0, %1", xops);
2851
2852   /* Probe at TEST_ADDR.  */
2853   output_asm_insn ("str\txzr, [%0]", xops);
2854
2855   /* Test if TEST_ADDR == LAST_ADDR.  */
2856   xops[1] = reg2;
2857   output_asm_insn ("cmp\t%0, %1", xops);
2858
2859   /* Branch.  */
2860   fputs ("\tb.ne\t", asm_out_file);
2861   assemble_name_raw (asm_out_file, loop_lab);
2862   fputc ('\n', asm_out_file);
2863
2864   return "";
2865 }
2866
2867 static bool
2868 aarch64_frame_pointer_required (void)
2869 {
2870   /* Use the frame pointer if enabled and it is not a leaf function, unless
2871      leaf frame pointer omission is disabled.  If the frame pointer is enabled,
2872      force the frame pointer in leaf functions which use LR.  */
2873   if (flag_omit_frame_pointer == 2
2874       && !(flag_omit_leaf_frame_pointer
2875            && crtl->is_leaf
2876            && !df_regs_ever_live_p (LR_REGNUM)))
2877     return true;
2878
2879   return false;
2880 }
2881
2882 /* Mark the registers that need to be saved by the callee and calculate
2883    the size of the callee-saved registers area and frame record (both FP
2884    and LR may be omitted).  If the function is not a leaf, ensure LR is
2885    saved at the bottom of the callee-save area.  */
2886 static void
2887 aarch64_layout_frame (void)
2888 {
2889   HOST_WIDE_INT offset = 0;
2890   int regno, last_fp_reg = INVALID_REGNUM;
2891
2892   if (reload_completed && cfun->machine->frame.laid_out)
2893     return;
2894
2895   /* Force a frame chain for EH returns so the return address is at FP+8.  */
2896   cfun->machine->frame.emit_frame_chain
2897     = frame_pointer_needed || crtl->calls_eh_return;
2898
2899 #define SLOT_NOT_REQUIRED (-2)
2900 #define SLOT_REQUIRED     (-1)
2901
2902   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2903   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2904
2905   /* First mark all the registers that really need to be saved...  */
2906   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2907     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2908
2909   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2910     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2911
2912   /* ... that includes the eh data registers (if needed)...  */
2913   if (crtl->calls_eh_return)
2914     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2915       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2916         = SLOT_REQUIRED;
2917
2918   /* ... and any callee saved register that dataflow says is live.  */
2919   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2920     if (df_regs_ever_live_p (regno)
2921         && (regno == R30_REGNUM
2922             || !call_used_regs[regno]))
2923       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2924
2925   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2926     if (df_regs_ever_live_p (regno)
2927         && !call_used_regs[regno])
2928       {
2929         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2930         last_fp_reg = regno;
2931       }
2932
2933   if (cfun->machine->frame.emit_frame_chain)
2934     {
2935       /* FP and LR are placed in the linkage record.  */
2936       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2937       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2938       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2939       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2940       offset = 2 * UNITS_PER_WORD;
2941     }
2942   else if (!crtl->is_leaf)
2943     {
2944       /* Ensure LR is saved at the bottom of the callee-saves.  */
2945       cfun->machine->frame.reg_offset[R30_REGNUM] = 0;
2946       cfun->machine->frame.wb_candidate1 = R30_REGNUM;
2947       offset = UNITS_PER_WORD;
2948     }
2949
2950   /* Now assign stack slots for them.  */
2951   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2952     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2953       {
2954         cfun->machine->frame.reg_offset[regno] = offset;
2955         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2956           cfun->machine->frame.wb_candidate1 = regno;
2957         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2958           cfun->machine->frame.wb_candidate2 = regno;
2959         offset += UNITS_PER_WORD;
2960       }
2961
2962   HOST_WIDE_INT max_int_offset = offset;
2963   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2964   bool has_align_gap = offset != max_int_offset;
2965
2966   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2967     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2968       {
2969         /* If there is an alignment gap between integer and fp callee-saves,
2970            allocate the last fp register to it if possible.  */
2971         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2972           {
2973             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2974             break;
2975           }
2976
2977         cfun->machine->frame.reg_offset[regno] = offset;
2978         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2979           cfun->machine->frame.wb_candidate1 = regno;
2980         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2981                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2982           cfun->machine->frame.wb_candidate2 = regno;
2983         offset += UNITS_PER_WORD;
2984       }
2985
2986   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2987
2988   cfun->machine->frame.saved_regs_size = offset;
2989
2990   HOST_WIDE_INT varargs_and_saved_regs_size
2991     = offset + cfun->machine->frame.saved_varargs_size;
2992
2993   cfun->machine->frame.hard_fp_offset
2994     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2995                 STACK_BOUNDARY / BITS_PER_UNIT);
2996
2997   cfun->machine->frame.frame_size
2998     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2999                 + crtl->outgoing_args_size,
3000                 STACK_BOUNDARY / BITS_PER_UNIT);
3001
3002   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3003
3004   cfun->machine->frame.initial_adjust = 0;
3005   cfun->machine->frame.final_adjust = 0;
3006   cfun->machine->frame.callee_adjust = 0;
3007   cfun->machine->frame.callee_offset = 0;
3008
3009   HOST_WIDE_INT max_push_offset = 0;
3010   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3011     max_push_offset = 512;
3012   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3013     max_push_offset = 256;
3014
3015   if (cfun->machine->frame.frame_size < max_push_offset
3016       && crtl->outgoing_args_size == 0)
3017     {
3018       /* Simple, small frame with no outgoing arguments:
3019          stp reg1, reg2, [sp, -frame_size]!
3020          stp reg3, reg4, [sp, 16]  */
3021       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3022     }
3023   else if ((crtl->outgoing_args_size
3024             + cfun->machine->frame.saved_regs_size < 512)
3025            && !(cfun->calls_alloca
3026                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3027     {
3028       /* Frame with small outgoing arguments:
3029          sub sp, sp, frame_size
3030          stp reg1, reg2, [sp, outgoing_args_size]
3031          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
3032       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3033       cfun->machine->frame.callee_offset
3034         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3035     }
3036   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3037     {
3038       /* Frame with large outgoing arguments but a small local area:
3039          stp reg1, reg2, [sp, -hard_fp_offset]!
3040          stp reg3, reg4, [sp, 16]
3041          sub sp, sp, outgoing_args_size  */
3042       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3043       cfun->machine->frame.final_adjust
3044         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3045     }
3046   else
3047     {
3048       /* Frame with large local area and outgoing arguments using frame pointer:
3049          sub sp, sp, hard_fp_offset
3050          stp x29, x30, [sp, 0]
3051          add x29, sp, 0
3052          stp reg3, reg4, [sp, 16]
3053          sub sp, sp, outgoing_args_size  */
3054       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3055       cfun->machine->frame.final_adjust
3056         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3057     }
3058
3059   cfun->machine->frame.laid_out = true;
3060 }
3061
3062 /* Return true if the register REGNO is saved on entry to
3063    the current function.  */
3064
3065 static bool
3066 aarch64_register_saved_on_entry (int regno)
3067 {
3068   return cfun->machine->frame.reg_offset[regno] >= 0;
3069 }
3070
3071 /* Return the next register up from REGNO up to LIMIT for the callee
3072    to save.  */
3073
3074 static unsigned
3075 aarch64_next_callee_save (unsigned regno, unsigned limit)
3076 {
3077   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3078     regno ++;
3079   return regno;
3080 }
3081
3082 /* Push the register number REGNO of mode MODE to the stack with write-back
3083    adjusting the stack by ADJUSTMENT.  */
3084
3085 static void
3086 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3087                            HOST_WIDE_INT adjustment)
3088  {
3089   rtx base_rtx = stack_pointer_rtx;
3090   rtx insn, reg, mem;
3091
3092   reg = gen_rtx_REG (mode, regno);
3093   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3094                             plus_constant (Pmode, base_rtx, -adjustment));
3095   mem = gen_frame_mem (mode, mem);
3096
3097   insn = emit_move_insn (mem, reg);
3098   RTX_FRAME_RELATED_P (insn) = 1;
3099 }
3100
3101 /* Generate and return an instruction to store the pair of registers
3102    REG and REG2 of mode MODE to location BASE with write-back adjusting
3103    the stack location BASE by ADJUSTMENT.  */
3104
3105 static rtx
3106 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3107                           HOST_WIDE_INT adjustment)
3108 {
3109   switch (mode)
3110     {
3111     case E_DImode:
3112       return gen_storewb_pairdi_di (base, base, reg, reg2,
3113                                     GEN_INT (-adjustment),
3114                                     GEN_INT (UNITS_PER_WORD - adjustment));
3115     case E_DFmode:
3116       return gen_storewb_pairdf_di (base, base, reg, reg2,
3117                                     GEN_INT (-adjustment),
3118                                     GEN_INT (UNITS_PER_WORD - adjustment));
3119     default:
3120       gcc_unreachable ();
3121     }
3122 }
3123
3124 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3125    stack pointer by ADJUSTMENT.  */
3126
3127 static void
3128 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3129 {
3130   rtx_insn *insn;
3131   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3132
3133   if (regno2 == INVALID_REGNUM)
3134     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3135
3136   rtx reg1 = gen_rtx_REG (mode, regno1);
3137   rtx reg2 = gen_rtx_REG (mode, regno2);
3138
3139   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3140                                               reg2, adjustment));
3141   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3142   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3143   RTX_FRAME_RELATED_P (insn) = 1;
3144 }
3145
3146 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3147    adjusting it by ADJUSTMENT afterwards.  */
3148
3149 static rtx
3150 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3151                          HOST_WIDE_INT adjustment)
3152 {
3153   switch (mode)
3154     {
3155     case E_DImode:
3156       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3157                                    GEN_INT (UNITS_PER_WORD));
3158     case E_DFmode:
3159       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3160                                    GEN_INT (UNITS_PER_WORD));
3161     default:
3162       gcc_unreachable ();
3163     }
3164 }
3165
3166 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3167    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3168    into CFI_OPS.  */
3169
3170 static void
3171 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3172                   rtx *cfi_ops)
3173 {
3174   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3175   rtx reg1 = gen_rtx_REG (mode, regno1);
3176
3177   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3178
3179   if (regno2 == INVALID_REGNUM)
3180     {
3181       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3182       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3183       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3184     }
3185   else
3186     {
3187       rtx reg2 = gen_rtx_REG (mode, regno2);
3188       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3189       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3190                                           reg2, adjustment));
3191     }
3192 }
3193
3194 /* Generate and return a store pair instruction of mode MODE to store
3195    register REG1 to MEM1 and register REG2 to MEM2.  */
3196
3197 static rtx
3198 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3199                         rtx reg2)
3200 {
3201   switch (mode)
3202     {
3203     case E_DImode:
3204       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3205
3206     case E_DFmode:
3207       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3208
3209     default:
3210       gcc_unreachable ();
3211     }
3212 }
3213
3214 /* Generate and regurn a load pair isntruction of mode MODE to load register
3215    REG1 from MEM1 and register REG2 from MEM2.  */
3216
3217 static rtx
3218 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3219                        rtx mem2)
3220 {
3221   switch (mode)
3222     {
3223     case E_DImode:
3224       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3225
3226     case E_DFmode:
3227       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3228
3229     default:
3230       gcc_unreachable ();
3231     }
3232 }
3233
3234 /* Return TRUE if return address signing should be enabled for the current
3235    function, otherwise return FALSE.  */
3236
3237 bool
3238 aarch64_return_address_signing_enabled (void)
3239 {
3240   /* This function should only be called after frame laid out.   */
3241   gcc_assert (cfun->machine->frame.laid_out);
3242
3243   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3244      if it's LR is pushed onto stack.  */
3245   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3246           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3247               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3248 }
3249
3250 /* Emit code to save the callee-saved registers from register number START
3251    to LIMIT to the stack at the location starting at offset START_OFFSET,
3252    skipping any write-back candidates if SKIP_WB is true.  */
3253
3254 static void
3255 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3256                            unsigned start, unsigned limit, bool skip_wb)
3257 {
3258   rtx_insn *insn;
3259   unsigned regno;
3260   unsigned regno2;
3261
3262   for (regno = aarch64_next_callee_save (start, limit);
3263        regno <= limit;
3264        regno = aarch64_next_callee_save (regno + 1, limit))
3265     {
3266       rtx reg, mem;
3267       HOST_WIDE_INT offset;
3268
3269       if (skip_wb
3270           && (regno == cfun->machine->frame.wb_candidate1
3271               || regno == cfun->machine->frame.wb_candidate2))
3272         continue;
3273
3274       if (cfun->machine->reg_is_wrapped_separately[regno])
3275        continue;
3276
3277       reg = gen_rtx_REG (mode, regno);
3278       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3279       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3280                                                 offset));
3281
3282       regno2 = aarch64_next_callee_save (regno + 1, limit);
3283
3284       if (regno2 <= limit
3285           && !cfun->machine->reg_is_wrapped_separately[regno2]
3286           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3287               == cfun->machine->frame.reg_offset[regno2]))
3288
3289         {
3290           rtx reg2 = gen_rtx_REG (mode, regno2);
3291           rtx mem2;
3292
3293           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3294           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3295                                                      offset));
3296           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3297                                                     reg2));
3298
3299           /* The first part of a frame-related parallel insn is
3300              always assumed to be relevant to the frame
3301              calculations; subsequent parts, are only
3302              frame-related if explicitly marked.  */
3303           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3304           regno = regno2;
3305         }
3306       else
3307         insn = emit_move_insn (mem, reg);
3308
3309       RTX_FRAME_RELATED_P (insn) = 1;
3310     }
3311 }
3312
3313 /* Emit code to restore the callee registers of mode MODE from register
3314    number START up to and including LIMIT.  Restore from the stack offset
3315    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3316    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3317
3318 static void
3319 aarch64_restore_callee_saves (machine_mode mode,
3320                               HOST_WIDE_INT start_offset, unsigned start,
3321                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3322 {
3323   rtx base_rtx = stack_pointer_rtx;
3324   unsigned regno;
3325   unsigned regno2;
3326   HOST_WIDE_INT offset;
3327
3328   for (regno = aarch64_next_callee_save (start, limit);
3329        regno <= limit;
3330        regno = aarch64_next_callee_save (regno + 1, limit))
3331     {
3332       if (cfun->machine->reg_is_wrapped_separately[regno])
3333        continue;
3334
3335       rtx reg, mem;
3336
3337       if (skip_wb
3338           && (regno == cfun->machine->frame.wb_candidate1
3339               || regno == cfun->machine->frame.wb_candidate2))
3340         continue;
3341
3342       reg = gen_rtx_REG (mode, regno);
3343       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3344       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3345
3346       regno2 = aarch64_next_callee_save (regno + 1, limit);
3347
3348       if (regno2 <= limit
3349           && !cfun->machine->reg_is_wrapped_separately[regno2]
3350           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3351               == cfun->machine->frame.reg_offset[regno2]))
3352         {
3353           rtx reg2 = gen_rtx_REG (mode, regno2);
3354           rtx mem2;
3355
3356           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3357           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3358           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3359
3360           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3361           regno = regno2;
3362         }
3363       else
3364         emit_move_insn (reg, mem);
3365       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3366     }
3367 }
3368
3369 static inline bool
3370 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3371                                HOST_WIDE_INT offset)
3372 {
3373   return offset >= -256 && offset < 256;
3374 }
3375
3376 static inline bool
3377 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3378 {
3379   return (offset >= 0
3380           && offset < 4096 * GET_MODE_SIZE (mode)
3381           && offset % GET_MODE_SIZE (mode) == 0);
3382 }
3383
3384 bool
3385 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3386 {
3387   return (offset >= -64 * GET_MODE_SIZE (mode)
3388           && offset < 64 * GET_MODE_SIZE (mode)
3389           && offset % GET_MODE_SIZE (mode) == 0);
3390 }
3391
3392 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3393
3394 static sbitmap
3395 aarch64_get_separate_components (void)
3396 {
3397   aarch64_layout_frame ();
3398
3399   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3400   bitmap_clear (components);
3401
3402   /* The registers we need saved to the frame.  */
3403   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3404     if (aarch64_register_saved_on_entry (regno))
3405       {
3406         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3407         if (!frame_pointer_needed)
3408           offset += cfun->machine->frame.frame_size
3409                     - cfun->machine->frame.hard_fp_offset;
3410         /* Check that we can access the stack slot of the register with one
3411            direct load with no adjustments needed.  */
3412         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3413           bitmap_set_bit (components, regno);
3414       }
3415
3416   /* Don't mess with the hard frame pointer.  */
3417   if (frame_pointer_needed)
3418     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3419
3420   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3421   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3422   /* If aarch64_layout_frame has chosen registers to store/restore with
3423      writeback don't interfere with them to avoid having to output explicit
3424      stack adjustment instructions.  */
3425   if (reg2 != INVALID_REGNUM)
3426     bitmap_clear_bit (components, reg2);
3427   if (reg1 != INVALID_REGNUM)
3428     bitmap_clear_bit (components, reg1);
3429
3430   bitmap_clear_bit (components, LR_REGNUM);
3431   bitmap_clear_bit (components, SP_REGNUM);
3432
3433   return components;
3434 }
3435
3436 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3437
3438 static sbitmap
3439 aarch64_components_for_bb (basic_block bb)
3440 {
3441   bitmap in = DF_LIVE_IN (bb);
3442   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3443   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3444
3445   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3446   bitmap_clear (components);
3447
3448   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3449   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3450     if ((!call_used_regs[regno])
3451        && (bitmap_bit_p (in, regno)
3452            || bitmap_bit_p (gen, regno)
3453            || bitmap_bit_p (kill, regno)))
3454           bitmap_set_bit (components, regno);
3455
3456   return components;
3457 }
3458
3459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3460    Nothing to do for aarch64.  */
3461
3462 static void
3463 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3464 {
3465 }
3466
3467 /* Return the next set bit in BMP from START onwards.  Return the total number
3468    of bits in BMP if no set bit is found at or after START.  */
3469
3470 static unsigned int
3471 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3472 {
3473   unsigned int nbits = SBITMAP_SIZE (bmp);
3474   if (start == nbits)
3475     return start;
3476
3477   gcc_assert (start < nbits);
3478   for (unsigned int i = start; i < nbits; i++)
3479     if (bitmap_bit_p (bmp, i))
3480       return i;
3481
3482   return nbits;
3483 }
3484
3485 /* Do the work for aarch64_emit_prologue_components and
3486    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3487    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3488    for these components or the epilogue sequence.  That is, it determines
3489    whether we should emit stores or loads and what kind of CFA notes to attach
3490    to the insns.  Otherwise the logic for the two sequences is very
3491    similar.  */
3492
3493 static void
3494 aarch64_process_components (sbitmap components, bool prologue_p)
3495 {
3496   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3497                              ? HARD_FRAME_POINTER_REGNUM
3498                              : STACK_POINTER_REGNUM);
3499
3500   unsigned last_regno = SBITMAP_SIZE (components);
3501   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3502   rtx_insn *insn = NULL;
3503
3504   while (regno != last_regno)
3505     {
3506       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3507          so DFmode for the vector registers is enough.  */
3508       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3509       rtx reg = gen_rtx_REG (mode, regno);
3510       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3511       if (!frame_pointer_needed)
3512         offset += cfun->machine->frame.frame_size
3513                   - cfun->machine->frame.hard_fp_offset;
3514       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3515       rtx mem = gen_frame_mem (mode, addr);
3516
3517       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3518       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3519       /* No more registers to handle after REGNO.
3520          Emit a single save/restore and exit.  */
3521       if (regno2 == last_regno)
3522         {
3523           insn = emit_insn (set);
3524           RTX_FRAME_RELATED_P (insn) = 1;
3525           if (prologue_p)
3526             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3527           else
3528             add_reg_note (insn, REG_CFA_RESTORE, reg);
3529           break;
3530         }
3531
3532       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3533       /* The next register is not of the same class or its offset is not
3534          mergeable with the current one into a pair.  */
3535       if (!satisfies_constraint_Ump (mem)
3536           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3537           || (offset2 - cfun->machine->frame.reg_offset[regno])
3538                 != GET_MODE_SIZE (mode))
3539         {
3540           insn = emit_insn (set);
3541           RTX_FRAME_RELATED_P (insn) = 1;
3542           if (prologue_p)
3543             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3544           else
3545             add_reg_note (insn, REG_CFA_RESTORE, reg);
3546
3547           regno = regno2;
3548           continue;
3549         }
3550
3551       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3552       rtx reg2 = gen_rtx_REG (mode, regno2);
3553       if (!frame_pointer_needed)
3554         offset2 += cfun->machine->frame.frame_size
3555                   - cfun->machine->frame.hard_fp_offset;
3556       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3557       rtx mem2 = gen_frame_mem (mode, addr2);
3558       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3559                              : gen_rtx_SET (reg2, mem2);
3560
3561       if (prologue_p)
3562         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3563       else
3564         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3565
3566       RTX_FRAME_RELATED_P (insn) = 1;
3567       if (prologue_p)
3568         {
3569           add_reg_note (insn, REG_CFA_OFFSET, set);
3570           add_reg_note (insn, REG_CFA_OFFSET, set2);
3571         }
3572       else
3573         {
3574           add_reg_note (insn, REG_CFA_RESTORE, reg);
3575           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3576         }
3577
3578       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3579     }
3580 }
3581
3582 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3583
3584 static void
3585 aarch64_emit_prologue_components (sbitmap components)
3586 {
3587   aarch64_process_components (components, true);
3588 }
3589
3590 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3591
3592 static void
3593 aarch64_emit_epilogue_components (sbitmap components)
3594 {
3595   aarch64_process_components (components, false);
3596 }
3597
3598 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3599
3600 static void
3601 aarch64_set_handled_components (sbitmap components)
3602 {
3603   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3604     if (bitmap_bit_p (components, regno))
3605       cfun->machine->reg_is_wrapped_separately[regno] = true;
3606 }
3607
3608 /* AArch64 stack frames generated by this compiler look like:
3609
3610         +-------------------------------+
3611         |                               |
3612         |  incoming stack arguments     |
3613         |                               |
3614         +-------------------------------+
3615         |                               | <-- incoming stack pointer (aligned)
3616         |  callee-allocated save area   |
3617         |  for register varargs         |
3618         |                               |
3619         +-------------------------------+
3620         |  local variables              | <-- frame_pointer_rtx
3621         |                               |
3622         +-------------------------------+
3623         |  padding0                     | \
3624         +-------------------------------+  |
3625         |  callee-saved registers       |  | frame.saved_regs_size
3626         +-------------------------------+  |
3627         |  LR'                          |  |
3628         +-------------------------------+  |
3629         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3630         +-------------------------------+
3631         |  dynamic allocation           |
3632         +-------------------------------+
3633         |  padding                      |
3634         +-------------------------------+
3635         |  outgoing stack arguments     | <-- arg_pointer
3636         |                               |
3637         +-------------------------------+
3638         |                               | <-- stack_pointer_rtx (aligned)
3639
3640    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3641    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3642    unchanged.  */
3643
3644 /* Generate the prologue instructions for entry into a function.
3645    Establish the stack frame by decreasing the stack pointer with a
3646    properly calculated size and, if necessary, create a frame record
3647    filled with the values of LR and previous frame pointer.  The
3648    current FP is also set up if it is in use.  */
3649
3650 void
3651 aarch64_expand_prologue (void)
3652 {
3653   aarch64_layout_frame ();
3654
3655   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3656   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3657   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3658   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3659   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3660   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3661   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3662   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
3663   rtx_insn *insn;
3664
3665   /* Sign return address for functions.  */
3666   if (aarch64_return_address_signing_enabled ())
3667     {
3668       insn = emit_insn (gen_pacisp ());
3669       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3670       RTX_FRAME_RELATED_P (insn) = 1;
3671     }
3672
3673   if (flag_stack_usage_info)
3674     current_function_static_stack_size = frame_size;
3675
3676   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3677     {
3678       if (crtl->is_leaf && !cfun->calls_alloca)
3679         {
3680           if (frame_size > PROBE_INTERVAL
3681               && frame_size > get_stack_check_protect ())
3682             aarch64_emit_probe_stack_range (get_stack_check_protect (),
3683                                             (frame_size
3684                                              - get_stack_check_protect ()));
3685         }
3686       else if (frame_size > 0)
3687         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3688     }
3689
3690   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3691
3692   if (callee_adjust != 0)
3693     aarch64_push_regs (reg1, reg2, callee_adjust);
3694
3695   if (emit_frame_chain)
3696     {
3697       if (callee_adjust == 0)
3698         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3699                                    R30_REGNUM, false);
3700       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3701                                        stack_pointer_rtx,
3702                                        GEN_INT (callee_offset)));
3703       RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
3704       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3705     }
3706
3707   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3708                              callee_adjust != 0 || emit_frame_chain);
3709   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3710                              callee_adjust != 0 || emit_frame_chain);
3711   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3712 }
3713
3714 /* Return TRUE if we can use a simple_return insn.
3715
3716    This function checks whether the callee saved stack is empty, which
3717    means no restore actions are need. The pro_and_epilogue will use
3718    this to check whether shrink-wrapping opt is feasible.  */
3719
3720 bool
3721 aarch64_use_return_insn_p (void)
3722 {
3723   if (!reload_completed)
3724     return false;
3725
3726   if (crtl->profile)
3727     return false;
3728
3729   aarch64_layout_frame ();
3730
3731   return cfun->machine->frame.frame_size == 0;
3732 }
3733
3734 /* Generate the epilogue instructions for returning from a function.
3735    This is almost exactly the reverse of the prolog sequence, except
3736    that we need to insert barriers to avoid scheduling loads that read
3737    from a deallocated stack, and we optimize the unwind records by
3738    emitting them all together if possible.  */
3739 void
3740 aarch64_expand_epilogue (bool for_sibcall)
3741 {
3742   aarch64_layout_frame ();
3743
3744   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3745   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3746   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3747   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3748   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3749   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3750   rtx cfi_ops = NULL;
3751   rtx_insn *insn;
3752
3753   /* We need to add memory barrier to prevent read from deallocated stack.  */
3754   bool need_barrier_p = (get_frame_size ()
3755                          + cfun->machine->frame.saved_varargs_size) != 0;
3756
3757   /* Emit a barrier to prevent loads from a deallocated stack.  */
3758   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3759       || crtl->calls_eh_return)
3760     {
3761       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3762       need_barrier_p = false;
3763     }
3764
3765   /* Restore the stack pointer from the frame pointer if it may not
3766      be the same as the stack pointer.  */
3767   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3768     {
3769       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3770                                        hard_frame_pointer_rtx,
3771                                        GEN_INT (-callee_offset)));
3772       /* If writeback is used when restoring callee-saves, the CFA
3773          is restored on the instruction doing the writeback.  */
3774       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3775     }
3776   else
3777     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3778
3779   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3780                                 callee_adjust != 0, &cfi_ops);
3781   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3782                                 callee_adjust != 0, &cfi_ops);
3783
3784   if (need_barrier_p)
3785     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3786
3787   if (callee_adjust != 0)
3788     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3789
3790   if (callee_adjust != 0 || initial_adjust > 65536)
3791     {
3792       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3793       insn = get_last_insn ();
3794       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3795       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3796       RTX_FRAME_RELATED_P (insn) = 1;
3797       cfi_ops = NULL;
3798     }
3799
3800   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3801
3802   if (cfi_ops)
3803     {
3804       /* Emit delayed restores and reset the CFA to be SP.  */
3805       insn = get_last_insn ();
3806       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3807       REG_NOTES (insn) = cfi_ops;
3808       RTX_FRAME_RELATED_P (insn) = 1;
3809     }
3810
3811   /* We prefer to emit the combined return/authenticate instruction RETAA,
3812      however there are three cases in which we must instead emit an explicit
3813      authentication instruction.
3814
3815         1) Sibcalls don't return in a normal way, so if we're about to call one
3816            we must authenticate.
3817
3818         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3819            generating code for !TARGET_ARMV8_3 we can't use it and must
3820            explicitly authenticate.
3821
3822         3) On an eh_return path we make extra stack adjustments to update the
3823            canonical frame address to be the exception handler's CFA.  We want
3824            to authenticate using the CFA of the function which calls eh_return.
3825     */
3826   if (aarch64_return_address_signing_enabled ()
3827       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3828     {
3829       insn = emit_insn (gen_autisp ());
3830       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3831       RTX_FRAME_RELATED_P (insn) = 1;
3832     }
3833
3834   /* Stack adjustment for exception handler.  */
3835   if (crtl->calls_eh_return)
3836     {
3837       /* We need to unwind the stack by the offset computed by
3838          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3839          to be SP; letting the CFA move during this adjustment
3840          is just as correct as retaining the CFA from the body
3841          of the function.  Therefore, do nothing special.  */
3842       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3843     }
3844
3845   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3846   if (!for_sibcall)
3847     emit_jump_insn (ret_rtx);
3848 }
3849
3850 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3851    normally or return to a previous frame after unwinding.
3852
3853    An EH return uses a single shared return sequence.  The epilogue is
3854    exactly like a normal epilogue except that it has an extra input
3855    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3856    that must be applied after the frame has been destroyed.  An extra label
3857    is inserted before the epilogue which initializes this register to zero,
3858    and this is the entry point for a normal return.
3859
3860    An actual EH return updates the return address, initializes the stack
3861    adjustment and jumps directly into the epilogue (bypassing the zeroing
3862    of the adjustment).  Since the return address is typically saved on the
3863    stack when a function makes a call, the saved LR must be updated outside
3864    the epilogue.
3865
3866    This poses problems as the store is generated well before the epilogue,
3867    so the offset of LR is not known yet.  Also optimizations will remove the
3868    store as it appears dead, even after the epilogue is generated (as the
3869    base or offset for loading LR is different in many cases).
3870
3871    To avoid these problems this implementation forces the frame pointer
3872    in eh_return functions so that the location of LR is fixed and known early.
3873    It also marks the store volatile, so no optimization is permitted to
3874    remove the store.  */
3875 rtx
3876 aarch64_eh_return_handler_rtx (void)
3877 {
3878   rtx tmp = gen_frame_mem (Pmode,
3879     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3880
3881   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3882   MEM_VOLATILE_P (tmp) = true;
3883   return tmp;
3884 }
3885
3886 /* Output code to add DELTA to the first argument, and then jump
3887    to FUNCTION.  Used for C++ multiple inheritance.  */
3888 static void
3889 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3890                          HOST_WIDE_INT delta,
3891                          HOST_WIDE_INT vcall_offset,
3892                          tree function)
3893 {
3894   /* The this pointer is always in x0.  Note that this differs from
3895      Arm where the this pointer maybe bumped to r1 if r0 is required
3896      to return a pointer to an aggregate.  On AArch64 a result value
3897      pointer will be in x8.  */
3898   int this_regno = R0_REGNUM;
3899   rtx this_rtx, temp0, temp1, addr, funexp;
3900   rtx_insn *insn;
3901
3902   reload_completed = 1;
3903   emit_note (NOTE_INSN_PROLOGUE_END);
3904
3905   if (vcall_offset == 0)
3906     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3907   else
3908     {
3909       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3910
3911       this_rtx = gen_rtx_REG (Pmode, this_regno);
3912       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3913       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3914
3915       addr = this_rtx;
3916       if (delta != 0)
3917         {
3918           if (delta >= -256 && delta < 256)
3919             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3920                                        plus_constant (Pmode, this_rtx, delta));
3921           else
3922             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3923         }
3924
3925       if (Pmode == ptr_mode)
3926         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3927       else
3928         aarch64_emit_move (temp0,
3929                            gen_rtx_ZERO_EXTEND (Pmode,
3930                                                 gen_rtx_MEM (ptr_mode, addr)));
3931
3932       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3933           addr = plus_constant (Pmode, temp0, vcall_offset);
3934       else
3935         {
3936           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3937                                           Pmode);
3938           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3939         }
3940
3941       if (Pmode == ptr_mode)
3942         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3943       else
3944         aarch64_emit_move (temp1,
3945                            gen_rtx_SIGN_EXTEND (Pmode,
3946                                                 gen_rtx_MEM (ptr_mode, addr)));
3947
3948       emit_insn (gen_add2_insn (this_rtx, temp1));
3949     }
3950
3951   /* Generate a tail call to the target function.  */
3952   if (!TREE_USED (function))
3953     {
3954       assemble_external (function);
3955       TREE_USED (function) = 1;
3956     }
3957   funexp = XEXP (DECL_RTL (function), 0);
3958   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3959   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3960   SIBLING_CALL_P (insn) = 1;
3961
3962   insn = get_insns ();
3963   shorten_branches (insn);
3964   final_start_function (insn, file, 1);
3965   final (insn, file, 1);
3966   final_end_function ();
3967
3968   /* Stop pretending to be a post-reload pass.  */
3969   reload_completed = 0;
3970 }
3971
3972 static bool
3973 aarch64_tls_referenced_p (rtx x)
3974 {
3975   if (!TARGET_HAVE_TLS)
3976     return false;
3977   subrtx_iterator::array_type array;
3978   FOR_EACH_SUBRTX (iter, array, x, ALL)
3979     {
3980       const_rtx x = *iter;
3981       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3982         return true;
3983       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3984          TLS offsets, not real symbol references.  */
3985       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3986         iter.skip_subrtxes ();
3987     }
3988   return false;
3989 }
3990
3991
3992 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3993    a left shift of 0 or 12 bits.  */
3994 bool
3995 aarch64_uimm12_shift (HOST_WIDE_INT val)
3996 {
3997   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3998           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3999           );
4000 }
4001
4002
4003 /* Return true if val is an immediate that can be loaded into a
4004    register by a MOVZ instruction.  */
4005 static bool
4006 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4007 {
4008   if (GET_MODE_SIZE (mode) > 4)
4009     {
4010       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4011           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4012         return 1;
4013     }
4014   else
4015     {
4016       /* Ignore sign extension.  */
4017       val &= (HOST_WIDE_INT) 0xffffffff;
4018     }
4019   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4020           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4021 }
4022
4023 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4024
4025 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4026   {
4027     0x0000000100000001ull,
4028     0x0001000100010001ull,
4029     0x0101010101010101ull,
4030     0x1111111111111111ull,
4031     0x5555555555555555ull,
4032   };
4033
4034
4035 /* Return true if val is a valid bitmask immediate.  */
4036
4037 bool
4038 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4039 {
4040   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4041   int bits;
4042
4043   /* Check for a single sequence of one bits and return quickly if so.
4044      The special cases of all ones and all zeroes returns false.  */
4045   val = (unsigned HOST_WIDE_INT) val_in;
4046   tmp = val + (val & -val);
4047
4048   if (tmp == (tmp & -tmp))
4049     return (val + 1) > 1;
4050
4051   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4052   if (mode == SImode)
4053     val = (val << 32) | (val & 0xffffffff);
4054
4055   /* Invert if the immediate doesn't start with a zero bit - this means we
4056      only need to search for sequences of one bits.  */
4057   if (val & 1)
4058     val = ~val;
4059
4060   /* Find the first set bit and set tmp to val with the first sequence of one
4061      bits removed.  Return success if there is a single sequence of ones.  */
4062   first_one = val & -val;
4063   tmp = val & (val + first_one);
4064
4065   if (tmp == 0)
4066     return true;
4067
4068   /* Find the next set bit and compute the difference in bit position.  */
4069   next_one = tmp & -tmp;
4070   bits = clz_hwi (first_one) - clz_hwi (next_one);
4071   mask = val ^ tmp;
4072
4073   /* Check the bit position difference is a power of 2, and that the first
4074      sequence of one bits fits within 'bits' bits.  */
4075   if ((mask >> bits) != 0 || bits != (bits & -bits))
4076     return false;
4077
4078   /* Check the sequence of one bits is repeated 64/bits times.  */
4079   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4080 }
4081
4082 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4083    Assumed precondition: VAL_IN Is not zero.  */
4084
4085 unsigned HOST_WIDE_INT
4086 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4087 {
4088   int lowest_bit_set = ctz_hwi (val_in);
4089   int highest_bit_set = floor_log2 (val_in);
4090   gcc_assert (val_in != 0);
4091
4092   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4093           (HOST_WIDE_INT_1U << lowest_bit_set));
4094 }
4095
4096 /* Create constant where bits outside of lowest bit set to highest bit set
4097    are set to 1.  */
4098
4099 unsigned HOST_WIDE_INT
4100 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4101 {
4102   return val_in | ~aarch64_and_split_imm1 (val_in);
4103 }
4104
4105 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4106
4107 bool
4108 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4109 {
4110   scalar_int_mode int_mode;
4111   if (!is_a <scalar_int_mode> (mode, &int_mode))
4112     return false;
4113
4114   if (aarch64_bitmask_imm (val_in, int_mode))
4115     return false;
4116
4117   if (aarch64_move_imm (val_in, int_mode))
4118     return false;
4119
4120   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4121
4122   return aarch64_bitmask_imm (imm2, int_mode);
4123 }
4124
4125 /* Return true if val is an immediate that can be loaded into a
4126    register in a single instruction.  */
4127 bool
4128 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4129 {
4130   scalar_int_mode int_mode;
4131   if (!is_a <scalar_int_mode> (mode, &int_mode))
4132     return false;
4133
4134   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4135     return 1;
4136   return aarch64_bitmask_imm (val, int_mode);
4137 }
4138
4139 static bool
4140 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4141 {
4142   rtx base, offset;
4143
4144   if (GET_CODE (x) == HIGH)
4145     return true;
4146
4147   split_const (x, &base, &offset);
4148   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4149     {
4150       if (aarch64_classify_symbol (base, offset)
4151           != SYMBOL_FORCE_TO_MEM)
4152         return true;
4153       else
4154         /* Avoid generating a 64-bit relocation in ILP32; leave
4155            to aarch64_expand_mov_immediate to handle it properly.  */
4156         return mode != ptr_mode;
4157     }
4158
4159   return aarch64_tls_referenced_p (x);
4160 }
4161
4162 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4163    The expansion for a table switch is quite expensive due to the number
4164    of instructions, the table lookup and hard to predict indirect jump.
4165    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4166    set, otherwise use tables for > 16 cases as a tradeoff between size and
4167    performance.  When optimizing for size, use the default setting.  */
4168
4169 static unsigned int
4170 aarch64_case_values_threshold (void)
4171 {
4172   /* Use the specified limit for the number of cases before using jump
4173      tables at higher optimization levels.  */
4174   if (optimize > 2
4175       && selected_cpu->tune->max_case_values != 0)
4176     return selected_cpu->tune->max_case_values;
4177   else
4178     return optimize_size ? default_case_values_threshold () : 17;
4179 }
4180
4181 /* Return true if register REGNO is a valid index register.
4182    STRICT_P is true if REG_OK_STRICT is in effect.  */
4183
4184 bool
4185 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4186 {
4187   if (!HARD_REGISTER_NUM_P (regno))
4188     {
4189       if (!strict_p)
4190         return true;
4191
4192       if (!reg_renumber)
4193         return false;
4194
4195       regno = reg_renumber[regno];
4196     }
4197   return GP_REGNUM_P (regno);
4198 }
4199
4200 /* Return true if register REGNO is a valid base register for mode MODE.
4201    STRICT_P is true if REG_OK_STRICT is in effect.  */
4202
4203 bool
4204 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4205 {
4206   if (!HARD_REGISTER_NUM_P (regno))
4207     {
4208       if (!strict_p)
4209         return true;
4210
4211       if (!reg_renumber)
4212         return false;
4213
4214       regno = reg_renumber[regno];
4215     }
4216
4217   /* The fake registers will be eliminated to either the stack or
4218      hard frame pointer, both of which are usually valid base registers.
4219      Reload deals with the cases where the eliminated form isn't valid.  */
4220   return (GP_REGNUM_P (regno)
4221           || regno == SP_REGNUM
4222           || regno == FRAME_POINTER_REGNUM
4223           || regno == ARG_POINTER_REGNUM);
4224 }
4225
4226 /* Return true if X is a valid base register for mode MODE.
4227    STRICT_P is true if REG_OK_STRICT is in effect.  */
4228
4229 static bool
4230 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4231 {
4232   if (!strict_p
4233       && GET_CODE (x) == SUBREG
4234       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4235     x = SUBREG_REG (x);
4236
4237   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4238 }
4239
4240 /* Return true if address offset is a valid index.  If it is, fill in INFO
4241    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4242
4243 static bool
4244 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4245                         machine_mode mode, bool strict_p)
4246 {
4247   enum aarch64_address_type type;
4248   rtx index;
4249   int shift;
4250
4251   /* (reg:P) */
4252   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4253       && GET_MODE (x) == Pmode)
4254     {
4255       type = ADDRESS_REG_REG;
4256       index = x;
4257       shift = 0;
4258     }
4259   /* (sign_extend:DI (reg:SI)) */
4260   else if ((GET_CODE (x) == SIGN_EXTEND
4261             || GET_CODE (x) == ZERO_EXTEND)
4262            && GET_MODE (x) == DImode
4263            && GET_MODE (XEXP (x, 0)) == SImode)
4264     {
4265       type = (GET_CODE (x) == SIGN_EXTEND)
4266         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267       index = XEXP (x, 0);
4268       shift = 0;
4269     }
4270   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4271   else if (GET_CODE (x) == MULT
4272            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274            && GET_MODE (XEXP (x, 0)) == DImode
4275            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276            && CONST_INT_P (XEXP (x, 1)))
4277     {
4278       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280       index = XEXP (XEXP (x, 0), 0);
4281       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4282     }
4283   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4284   else if (GET_CODE (x) == ASHIFT
4285            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4286                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4287            && GET_MODE (XEXP (x, 0)) == DImode
4288            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4289            && CONST_INT_P (XEXP (x, 1)))
4290     {
4291       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4292         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293       index = XEXP (XEXP (x, 0), 0);
4294       shift = INTVAL (XEXP (x, 1));
4295     }
4296   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4297   else if ((GET_CODE (x) == SIGN_EXTRACT
4298             || GET_CODE (x) == ZERO_EXTRACT)
4299            && GET_MODE (x) == DImode
4300            && GET_CODE (XEXP (x, 0)) == MULT
4301            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4302            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4303     {
4304       type = (GET_CODE (x) == SIGN_EXTRACT)
4305         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4306       index = XEXP (XEXP (x, 0), 0);
4307       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4308       if (INTVAL (XEXP (x, 1)) != 32 + shift
4309           || INTVAL (XEXP (x, 2)) != 0)
4310         shift = -1;
4311     }
4312   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4313      (const_int 0xffffffff<<shift)) */
4314   else if (GET_CODE (x) == AND
4315            && GET_MODE (x) == DImode
4316            && GET_CODE (XEXP (x, 0)) == MULT
4317            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4318            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4319            && CONST_INT_P (XEXP (x, 1)))
4320     {
4321       type = ADDRESS_REG_UXTW;
4322       index = XEXP (XEXP (x, 0), 0);
4323       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4324       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4325         shift = -1;
4326     }
4327   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4328   else if ((GET_CODE (x) == SIGN_EXTRACT
4329             || GET_CODE (x) == ZERO_EXTRACT)
4330            && GET_MODE (x) == DImode
4331            && GET_CODE (XEXP (x, 0)) == ASHIFT
4332            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4333            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4334     {
4335       type = (GET_CODE (x) == SIGN_EXTRACT)
4336         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4337       index = XEXP (XEXP (x, 0), 0);
4338       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4339       if (INTVAL (XEXP (x, 1)) != 32 + shift
4340           || INTVAL (XEXP (x, 2)) != 0)
4341         shift = -1;
4342     }
4343   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4344      (const_int 0xffffffff<<shift)) */
4345   else if (GET_CODE (x) == AND
4346            && GET_MODE (x) == DImode
4347            && GET_CODE (XEXP (x, 0)) == ASHIFT
4348            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4349            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4350            && CONST_INT_P (XEXP (x, 1)))
4351     {
4352       type = ADDRESS_REG_UXTW;
4353       index = XEXP (XEXP (x, 0), 0);
4354       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4355       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4356         shift = -1;
4357     }
4358   /* (mult:P (reg:P) (const_int scale)) */
4359   else if (GET_CODE (x) == MULT
4360            && GET_MODE (x) == Pmode
4361            && GET_MODE (XEXP (x, 0)) == Pmode
4362            && CONST_INT_P (XEXP (x, 1)))
4363     {
4364       type = ADDRESS_REG_REG;
4365       index = XEXP (x, 0);
4366       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4367     }
4368   /* (ashift:P (reg:P) (const_int shift)) */
4369   else if (GET_CODE (x) == ASHIFT
4370            && GET_MODE (x) == Pmode
4371            && GET_MODE (XEXP (x, 0)) == Pmode
4372            && CONST_INT_P (XEXP (x, 1)))
4373     {
4374       type = ADDRESS_REG_REG;
4375       index = XEXP (x, 0);
4376       shift = INTVAL (XEXP (x, 1));
4377     }
4378   else
4379     return false;
4380
4381   if (!strict_p
4382       && GET_CODE (index) == SUBREG
4383       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4384     index = SUBREG_REG (index);
4385
4386   if ((shift == 0 ||
4387        (shift > 0 && shift <= 3
4388         && (1 << shift) == GET_MODE_SIZE (mode)))
4389       && REG_P (index)
4390       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4391     {
4392       info->type = type;
4393       info->offset = index;
4394       info->shift = shift;
4395       return true;
4396     }
4397
4398   return false;
4399 }
4400
4401 /* Return true if MODE is one of the modes for which we
4402    support LDP/STP operations.  */
4403
4404 static bool
4405 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4406 {
4407   return mode == SImode || mode == DImode
4408          || mode == SFmode || mode == DFmode
4409          || (aarch64_vector_mode_supported_p (mode)
4410              && GET_MODE_SIZE (mode) == 8);
4411 }
4412
4413 /* Return true if REGNO is a virtual pointer register, or an eliminable
4414    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4415    include stack_pointer or hard_frame_pointer.  */
4416 static bool
4417 virt_or_elim_regno_p (unsigned regno)
4418 {
4419   return ((regno >= FIRST_VIRTUAL_REGISTER
4420            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4421           || regno == FRAME_POINTER_REGNUM
4422           || regno == ARG_POINTER_REGNUM);
4423 }
4424
4425 /* Return true if X is a valid address for machine mode MODE.  If it is,
4426    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4427    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4428
4429 static bool
4430 aarch64_classify_address (struct aarch64_address_info *info,
4431                           rtx x, machine_mode mode,
4432                           RTX_CODE outer_code, bool strict_p)
4433 {
4434   enum rtx_code code = GET_CODE (x);
4435   rtx op0, op1;
4436
4437   /* On BE, we use load/store pair for all large int mode load/stores.
4438      TI/TFmode may also use a load/store pair.  */
4439   bool load_store_pair_p = (outer_code == PARALLEL
4440                             || mode == TImode
4441                             || mode == TFmode
4442                             || (BYTES_BIG_ENDIAN
4443                                 && aarch64_vect_struct_mode_p (mode)));
4444
4445   bool allow_reg_index_p =
4446     !load_store_pair_p
4447     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4448     && !aarch64_vect_struct_mode_p (mode);
4449
4450   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4451      REG addressing.  */
4452   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4453       && (code != POST_INC && code != REG))
4454     return false;
4455
4456   switch (code)
4457     {
4458     case REG:
4459     case SUBREG:
4460       info->type = ADDRESS_REG_IMM;
4461       info->base = x;
4462       info->offset = const0_rtx;
4463       return aarch64_base_register_rtx_p (x, strict_p);
4464
4465     case PLUS:
4466       op0 = XEXP (x, 0);
4467       op1 = XEXP (x, 1);
4468
4469       if (! strict_p
4470           && REG_P (op0)
4471           && virt_or_elim_regno_p (REGNO (op0))
4472           && CONST_INT_P (op1))
4473         {
4474           info->type = ADDRESS_REG_IMM;
4475           info->base = op0;
4476           info->offset = op1;
4477
4478           return true;
4479         }
4480
4481       if (GET_MODE_SIZE (mode) != 0
4482           && CONST_INT_P (op1)
4483           && aarch64_base_register_rtx_p (op0, strict_p))
4484         {
4485           HOST_WIDE_INT offset = INTVAL (op1);
4486
4487           info->type = ADDRESS_REG_IMM;
4488           info->base = op0;
4489           info->offset = op1;
4490
4491           /* TImode and TFmode values are allowed in both pairs of X
4492              registers and individual Q registers.  The available
4493              address modes are:
4494              X,X: 7-bit signed scaled offset
4495              Q:   9-bit signed offset
4496              We conservatively require an offset representable in either mode.
4497              When performing the check for pairs of X registers i.e.  LDP/STP
4498              pass down DImode since that is the natural size of the LDP/STP
4499              instruction memory accesses.  */
4500           if (mode == TImode || mode == TFmode)
4501             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4502                     && (offset_9bit_signed_unscaled_p (mode, offset)
4503                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4504
4505           /* A 7bit offset check because OImode will emit a ldp/stp
4506              instruction (only big endian will get here).
4507              For ldp/stp instructions, the offset is scaled for the size of a
4508              single element of the pair.  */
4509           if (mode == OImode)
4510             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4511
4512           /* Three 9/12 bit offsets checks because CImode will emit three
4513              ldr/str instructions (only big endian will get here).  */
4514           if (mode == CImode)
4515             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4516                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4517                         || offset_12bit_unsigned_scaled_p (V16QImode,
4518                                                            offset + 32)));
4519
4520           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4521              instructions (only big endian will get here).  */
4522           if (mode == XImode)
4523             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4524                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4525                                                             offset + 32));
4526
4527           if (load_store_pair_p)
4528             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4529                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4530           else
4531             return (offset_9bit_signed_unscaled_p (mode, offset)
4532                     || offset_12bit_unsigned_scaled_p (mode, offset));
4533         }
4534
4535       if (allow_reg_index_p)
4536         {
4537           /* Look for base + (scaled/extended) index register.  */
4538           if (aarch64_base_register_rtx_p (op0, strict_p)
4539               && aarch64_classify_index (info, op1, mode, strict_p))
4540             {
4541               info->base = op0;
4542               return true;
4543             }
4544           if (aarch64_base_register_rtx_p (op1, strict_p)
4545               && aarch64_classify_index (info, op0, mode, strict_p))
4546             {
4547               info->base = op1;
4548               return true;
4549             }
4550         }
4551
4552       return false;
4553
4554     case POST_INC:
4555     case POST_DEC:
4556     case PRE_INC:
4557     case PRE_DEC:
4558       info->type = ADDRESS_REG_WB;
4559       info->base = XEXP (x, 0);
4560       info->offset = NULL_RTX;
4561       return aarch64_base_register_rtx_p (info->base, strict_p);
4562
4563     case POST_MODIFY:
4564     case PRE_MODIFY:
4565       info->type = ADDRESS_REG_WB;
4566       info->base = XEXP (x, 0);
4567       if (GET_CODE (XEXP (x, 1)) == PLUS
4568           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4569           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4570           && aarch64_base_register_rtx_p (info->base, strict_p))
4571         {
4572           HOST_WIDE_INT offset;
4573           info->offset = XEXP (XEXP (x, 1), 1);
4574           offset = INTVAL (info->offset);
4575
4576           /* TImode and TFmode values are allowed in both pairs of X
4577              registers and individual Q registers.  The available
4578              address modes are:
4579              X,X: 7-bit signed scaled offset
4580              Q:   9-bit signed offset
4581              We conservatively require an offset representable in either mode.
4582            */
4583           if (mode == TImode || mode == TFmode)
4584             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4585                     && offset_9bit_signed_unscaled_p (mode, offset));
4586
4587           if (load_store_pair_p)
4588             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4589                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4590           else
4591             return offset_9bit_signed_unscaled_p (mode, offset);
4592         }
4593       return false;
4594
4595     case CONST:
4596     case SYMBOL_REF:
4597     case LABEL_REF:
4598       /* load literal: pc-relative constant pool entry.  Only supported
4599          for SI mode or larger.  */
4600       info->type = ADDRESS_SYMBOLIC;
4601
4602       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4603         {
4604           rtx sym, addend;
4605
4606           split_const (x, &sym, &addend);
4607           return ((GET_CODE (sym) == LABEL_REF
4608                    || (GET_CODE (sym) == SYMBOL_REF
4609                        && CONSTANT_POOL_ADDRESS_P (sym)
4610                        && aarch64_pcrelative_literal_loads)));
4611         }
4612       return false;
4613
4614     case LO_SUM:
4615       info->type = ADDRESS_LO_SUM;
4616       info->base = XEXP (x, 0);
4617       info->offset = XEXP (x, 1);
4618       if (allow_reg_index_p
4619           && aarch64_base_register_rtx_p (info->base, strict_p))
4620         {
4621           rtx sym, offs;
4622           split_const (info->offset, &sym, &offs);
4623           if (GET_CODE (sym) == SYMBOL_REF
4624               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4625             {
4626               /* The symbol and offset must be aligned to the access size.  */
4627               unsigned int align;
4628               unsigned int ref_size;
4629
4630               if (CONSTANT_POOL_ADDRESS_P (sym))
4631                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4632               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4633                 {
4634                   tree exp = SYMBOL_REF_DECL (sym);
4635                   align = TYPE_ALIGN (TREE_TYPE (exp));
4636                   align = aarch64_constant_alignment (exp, align);
4637                 }
4638               else if (SYMBOL_REF_DECL (sym))
4639                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4640               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4641                        && SYMBOL_REF_BLOCK (sym) != NULL)
4642                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4643               else
4644                 align = BITS_PER_UNIT;
4645
4646               ref_size = GET_MODE_SIZE (mode);
4647               if (ref_size == 0)
4648                 ref_size = GET_MODE_SIZE (DImode);
4649
4650               return ((INTVAL (offs) & (ref_size - 1)) == 0
4651                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4652             }
4653         }
4654       return false;
4655
4656     default:
4657       return false;
4658     }
4659 }
4660
4661 /* Return true if the address X is valid for a PRFM instruction.
4662    STRICT_P is true if we should do strict checking with
4663    aarch64_classify_address.  */
4664
4665 bool
4666 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4667 {
4668   struct aarch64_address_info addr;
4669
4670   /* PRFM accepts the same addresses as DImode...  */
4671   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4672   if (!res)
4673     return false;
4674
4675   /* ... except writeback forms.  */
4676   return addr.type != ADDRESS_REG_WB;
4677 }
4678
4679 bool
4680 aarch64_symbolic_address_p (rtx x)
4681 {
4682   rtx offset;
4683
4684   split_const (x, &x, &offset);
4685   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4686 }
4687
4688 /* Classify the base of symbolic expression X.  */
4689
4690 enum aarch64_symbol_type
4691 aarch64_classify_symbolic_expression (rtx x)
4692 {
4693   rtx offset;
4694
4695   split_const (x, &x, &offset);
4696   return aarch64_classify_symbol (x, offset);
4697 }
4698
4699
4700 /* Return TRUE if X is a legitimate address for accessing memory in
4701    mode MODE.  */
4702 static bool
4703 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4704 {
4705   struct aarch64_address_info addr;
4706
4707   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4708 }
4709
4710 /* Return TRUE if X is a legitimate address for accessing memory in
4711    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4712    pair operation.  */
4713 bool
4714 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4715                               RTX_CODE outer_code, bool strict_p)
4716 {
4717   struct aarch64_address_info addr;
4718
4719   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4720 }
4721
4722 /* Split an out-of-range address displacement into a base and offset.
4723    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4724    to increase opportunities for sharing the base address of different sizes.
4725    Unaligned accesses use the signed 9-bit range, TImode/TFmode use
4726    the intersection of signed scaled 7-bit and signed 9-bit offset.  */
4727 static bool
4728 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4729 {
4730   HOST_WIDE_INT offset = INTVAL (*disp);
4731   HOST_WIDE_INT base;
4732
4733   if (mode == TImode || mode == TFmode)
4734     base = (offset + 0x100) & ~0x1f8;
4735   else if ((offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4736     base = (offset + 0x100) & ~0x1ff;
4737   else
4738     base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4739
4740   *off = GEN_INT (base);
4741   *disp = GEN_INT (offset - base);
4742   return true;
4743 }
4744
4745 /* Return the binary representation of floating point constant VALUE in INTVAL.
4746    If the value cannot be converted, return false without setting INTVAL.
4747    The conversion is done in the given MODE.  */
4748 bool
4749 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4750 {
4751
4752   /* We make a general exception for 0.  */
4753   if (aarch64_float_const_zero_rtx_p (value))
4754     {
4755       *intval = 0;
4756       return true;
4757     }
4758
4759   machine_mode mode = GET_MODE (value);
4760   if (GET_CODE (value) != CONST_DOUBLE
4761       || !SCALAR_FLOAT_MODE_P (mode)
4762       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4763       /* Only support up to DF mode.  */
4764       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4765     return false;
4766
4767   unsigned HOST_WIDE_INT ival = 0;
4768
4769   long res[2];
4770   real_to_target (res,
4771                   CONST_DOUBLE_REAL_VALUE (value),
4772                   REAL_MODE_FORMAT (mode));
4773
4774   if (mode == DFmode)
4775     {
4776       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4777       ival = zext_hwi (res[order], 32);
4778       ival |= (zext_hwi (res[1 - order], 32) << 32);
4779     }
4780   else
4781       ival = zext_hwi (res[0], 32);
4782
4783   *intval = ival;
4784   return true;
4785 }
4786
4787 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4788    single MOV(+MOVK) followed by an FMOV.  */
4789 bool
4790 aarch64_float_const_rtx_p (rtx x)
4791 {
4792   machine_mode mode = GET_MODE (x);
4793   if (mode == VOIDmode)
4794     return false;
4795
4796   /* Determine whether it's cheaper to write float constants as
4797      mov/movk pairs over ldr/adrp pairs.  */
4798   unsigned HOST_WIDE_INT ival;
4799
4800   if (GET_CODE (x) == CONST_DOUBLE
4801       && SCALAR_FLOAT_MODE_P (mode)
4802       && aarch64_reinterpret_float_as_int (x, &ival))
4803     {
4804       scalar_int_mode imode = (mode == HFmode
4805                                ? SImode
4806                                : int_mode_for_mode (mode).require ());
4807       int num_instr = aarch64_internal_mov_immediate
4808                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4809       return num_instr < 3;
4810     }
4811
4812   return false;
4813 }
4814
4815 /* Return TRUE if rtx X is immediate constant 0.0 */
4816 bool
4817 aarch64_float_const_zero_rtx_p (rtx x)
4818 {
4819   if (GET_MODE (x) == VOIDmode)
4820     return false;
4821
4822   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4823     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4824   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4825 }
4826
4827 /* Return TRUE if rtx X is immediate constant that fits in a single
4828    MOVI immediate operation.  */
4829 bool
4830 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4831 {
4832   if (!TARGET_SIMD)
4833      return false;
4834
4835   machine_mode vmode;
4836   scalar_int_mode imode;
4837   unsigned HOST_WIDE_INT ival;
4838
4839   if (GET_CODE (x) == CONST_DOUBLE
4840       && SCALAR_FLOAT_MODE_P (mode))
4841     {
4842       if (!aarch64_reinterpret_float_as_int (x, &ival))
4843         return false;
4844
4845       /* We make a general exception for 0.  */
4846       if (aarch64_float_const_zero_rtx_p (x))
4847         return true;
4848
4849       imode = int_mode_for_mode (mode).require ();
4850     }
4851   else if (GET_CODE (x) == CONST_INT
4852            && is_a <scalar_int_mode> (mode, &imode))
4853     ival = INTVAL (x);
4854   else
4855     return false;
4856
4857    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4858      a 128 bit vector mode.  */
4859   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4860
4861   vmode = aarch64_simd_container_mode (imode, width);
4862   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4863
4864   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4865 }
4866
4867
4868 /* Return the fixed registers used for condition codes.  */
4869
4870 static bool
4871 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4872 {
4873   *p1 = CC_REGNUM;
4874   *p2 = INVALID_REGNUM;
4875   return true;
4876 }
4877
4878 /* This function is used by the call expanders of the machine description.
4879    RESULT is the register in which the result is returned.  It's NULL for
4880    "call" and "sibcall".
4881    MEM is the location of the function call.
4882    SIBCALL indicates whether this function call is normal call or sibling call.
4883    It will generate different pattern accordingly.  */
4884
4885 void
4886 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4887 {
4888   rtx call, callee, tmp;
4889   rtvec vec;
4890   machine_mode mode;
4891
4892   gcc_assert (MEM_P (mem));
4893   callee = XEXP (mem, 0);
4894   mode = GET_MODE (callee);
4895   gcc_assert (mode == Pmode);
4896
4897   /* Decide if we should generate indirect calls by loading the
4898      address of the callee into a register before performing
4899      the branch-and-link.  */
4900   if (SYMBOL_REF_P (callee)
4901       ? (aarch64_is_long_call_p (callee)
4902          || aarch64_is_noplt_call_p (callee))
4903       : !REG_P (callee))
4904     XEXP (mem, 0) = force_reg (mode, callee);
4905
4906   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4907
4908   if (result != NULL_RTX)
4909     call = gen_rtx_SET (result, call);
4910
4911   if (sibcall)
4912     tmp = ret_rtx;
4913   else
4914     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4915
4916   vec = gen_rtvec (2, call, tmp);
4917   call = gen_rtx_PARALLEL (VOIDmode, vec);
4918
4919   aarch64_emit_call_insn (call);
4920 }
4921
4922 /* Emit call insn with PAT and do aarch64-specific handling.  */
4923
4924 void
4925 aarch64_emit_call_insn (rtx pat)
4926 {
4927   rtx insn = emit_call_insn (pat);
4928
4929   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4930   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4931   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4932 }
4933
4934 machine_mode
4935 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4936 {
4937   /* All floating point compares return CCFP if it is an equality
4938      comparison, and CCFPE otherwise.  */
4939   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4940     {
4941       switch (code)
4942         {
4943         case EQ:
4944         case NE:
4945         case UNORDERED:
4946         case ORDERED:
4947         case UNLT:
4948         case UNLE:
4949         case UNGT:
4950         case UNGE:
4951         case UNEQ:
4952         case LTGT:
4953           return CCFPmode;
4954
4955         case LT:
4956         case LE:
4957         case GT:
4958         case GE:
4959           return CCFPEmode;
4960
4961         default:
4962           gcc_unreachable ();
4963         }
4964     }
4965
4966   /* Equality comparisons of short modes against zero can be performed
4967      using the TST instruction with the appropriate bitmask.  */
4968   if (y == const0_rtx && REG_P (x)
4969       && (code == EQ || code == NE)
4970       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4971     return CC_NZmode;
4972
4973   /* Similarly, comparisons of zero_extends from shorter modes can
4974      be performed using an ANDS with an immediate mask.  */
4975   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4976       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4977       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4978       && (code == EQ || code == NE))
4979     return CC_NZmode;
4980
4981   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4982       && y == const0_rtx
4983       && (code == EQ || code == NE || code == LT || code == GE)
4984       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4985           || GET_CODE (x) == NEG
4986           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4987               && CONST_INT_P (XEXP (x, 2)))))
4988     return CC_NZmode;
4989
4990   /* A compare with a shifted operand.  Because of canonicalization,
4991      the comparison will have to be swapped when we emit the assembly
4992      code.  */
4993   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4994       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4995       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4996           || GET_CODE (x) == LSHIFTRT
4997           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4998     return CC_SWPmode;
4999
5000   /* Similarly for a negated operand, but we can only do this for
5001      equalities.  */
5002   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
5003       && (REG_P (y) || GET_CODE (y) == SUBREG)
5004       && (code == EQ || code == NE)
5005       && GET_CODE (x) == NEG)
5006     return CC_Zmode;
5007
5008   /* A test for unsigned overflow.  */
5009   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5010       && code == NE
5011       && GET_CODE (x) == PLUS
5012       && GET_CODE (y) == ZERO_EXTEND)
5013     return CC_Cmode;
5014
5015   /* For everything else, return CCmode.  */
5016   return CCmode;
5017 }
5018
5019 static int
5020 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5021
5022 int
5023 aarch64_get_condition_code (rtx x)
5024 {
5025   machine_mode mode = GET_MODE (XEXP (x, 0));
5026   enum rtx_code comp_code = GET_CODE (x);
5027
5028   if (GET_MODE_CLASS (mode) != MODE_CC)
5029     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5030   return aarch64_get_condition_code_1 (mode, comp_code);
5031 }
5032
5033 static int
5034 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5035 {
5036   switch (mode)
5037     {
5038     case E_CCFPmode:
5039     case E_CCFPEmode:
5040       switch (comp_code)
5041         {
5042         case GE: return AARCH64_GE;
5043         case GT: return AARCH64_GT;
5044         case LE: return AARCH64_LS;
5045         case LT: return AARCH64_MI;
5046         case NE: return AARCH64_NE;
5047         case EQ: return AARCH64_EQ;
5048         case ORDERED: return AARCH64_VC;
5049         case UNORDERED: return AARCH64_VS;
5050         case UNLT: return AARCH64_LT;
5051         case UNLE: return AARCH64_LE;
5052         case UNGT: return AARCH64_HI;
5053         case UNGE: return AARCH64_PL;
5054         default: return -1;
5055         }
5056       break;
5057
5058     case E_CCmode:
5059       switch (comp_code)
5060         {
5061         case NE: return AARCH64_NE;
5062         case EQ: return AARCH64_EQ;
5063         case GE: return AARCH64_GE;
5064         case GT: return AARCH64_GT;
5065         case LE: return AARCH64_LE;
5066         case LT: return AARCH64_LT;
5067         case GEU: return AARCH64_CS;
5068         case GTU: return AARCH64_HI;
5069         case LEU: return AARCH64_LS;
5070         case LTU: return AARCH64_CC;
5071         default: return -1;
5072         }
5073       break;
5074
5075     case E_CC_SWPmode:
5076       switch (comp_code)
5077         {
5078         case NE: return AARCH64_NE;
5079         case EQ: return AARCH64_EQ;
5080         case GE: return AARCH64_LE;
5081         case GT: return AARCH64_LT;
5082         case LE: return AARCH64_GE;
5083         case LT: return AARCH64_GT;
5084         case GEU: return AARCH64_LS;
5085         case GTU: return AARCH64_CC;
5086         case LEU: return AARCH64_CS;
5087         case LTU: return AARCH64_HI;
5088         default: return -1;
5089         }
5090       break;
5091
5092     case E_CC_NZmode:
5093       switch (comp_code)
5094         {
5095         case NE: return AARCH64_NE;
5096         case EQ: return AARCH64_EQ;
5097         case GE: return AARCH64_PL;
5098         case LT: return AARCH64_MI;
5099         default: return -1;
5100         }
5101       break;
5102
5103     case E_CC_Zmode:
5104       switch (comp_code)
5105         {
5106         case NE: return AARCH64_NE;
5107         case EQ: return AARCH64_EQ;
5108         default: return -1;
5109         }
5110       break;
5111
5112     case E_CC_Cmode:
5113       switch (comp_code)
5114         {
5115         case NE: return AARCH64_CS;
5116         case EQ: return AARCH64_CC;
5117         default: return -1;
5118         }
5119       break;
5120
5121     default:
5122       return -1;
5123     }
5124
5125   return -1;
5126 }
5127
5128 bool
5129 aarch64_const_vec_all_same_in_range_p (rtx x,
5130                                   HOST_WIDE_INT minval,
5131                                   HOST_WIDE_INT maxval)
5132 {
5133   HOST_WIDE_INT firstval;
5134   int count, i;
5135
5136   if (GET_CODE (x) != CONST_VECTOR
5137       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5138     return false;
5139
5140   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5141   if (firstval < minval || firstval > maxval)
5142     return false;
5143
5144   count = CONST_VECTOR_NUNITS (x);
5145   for (i = 1; i < count; i++)
5146     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5147       return false;
5148
5149   return true;
5150 }
5151
5152 bool
5153 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5154 {
5155   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5156 }
5157
5158
5159 /* N Z C V.  */
5160 #define AARCH64_CC_V 1
5161 #define AARCH64_CC_C (1 << 1)
5162 #define AARCH64_CC_Z (1 << 2)
5163 #define AARCH64_CC_N (1 << 3)
5164
5165 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5166 static const int aarch64_nzcv_codes[] =
5167 {
5168   0,            /* EQ, Z == 1.  */
5169   AARCH64_CC_Z, /* NE, Z == 0.  */
5170   0,            /* CS, C == 1.  */
5171   AARCH64_CC_C, /* CC, C == 0.  */
5172   0,            /* MI, N == 1.  */
5173   AARCH64_CC_N, /* PL, N == 0.  */
5174   0,            /* VS, V == 1.  */
5175   AARCH64_CC_V, /* VC, V == 0.  */
5176   0,            /* HI, C ==1 && Z == 0.  */
5177   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5178   AARCH64_CC_V, /* GE, N == V.  */
5179   0,            /* LT, N != V.  */
5180   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5181   0,            /* LE, !(Z == 0 && N == V).  */
5182   0,            /* AL, Any.  */
5183   0             /* NV, Any.  */
5184 };
5185
5186 /* Print operand X to file F in a target specific manner according to CODE.
5187    The acceptable formatting commands given by CODE are:
5188      'c':               An integer or symbol address without a preceding #
5189                         sign.
5190      'e':               Print the sign/zero-extend size as a character 8->b,
5191                         16->h, 32->w.
5192      'p':               Prints N such that 2^N == X (X must be power of 2 and
5193                         const int).
5194      'P':               Print the number of non-zero bits in X (a const_int).
5195      'H':               Print the higher numbered register of a pair (TImode)
5196                         of regs.
5197      'm':               Print a condition (eq, ne, etc).
5198      'M':               Same as 'm', but invert condition.
5199      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5200      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5201                         The register printed is the FP/SIMD register name
5202                         of X + 0/1/2/3 for S/T/U/V.
5203      'R':               Print a scalar FP/SIMD register name + 1.
5204      'X':               Print bottom 16 bits of integer constant in hex.
5205      'w/x':             Print a general register name or the zero register
5206                         (32-bit or 64-bit).
5207      '0':               Print a normal operand, if it's a general register,
5208                         then we assume DImode.
5209      'k':               Print NZCV for conditional compare instructions.
5210      'A':               Output address constant representing the first
5211                         argument of X, specifying a relocation offset
5212                         if appropriate.
5213      'L':               Output constant address specified by X
5214                         with a relocation offset if appropriate.
5215      'G':               Prints address of X, specifying a PC relative
5216                         relocation mode if appropriate.  */
5217
5218 static void
5219 aarch64_print_operand (FILE *f, rtx x, int code)
5220 {
5221   switch (code)
5222     {
5223     case 'c':
5224       switch (GET_CODE (x))
5225         {
5226         case CONST_INT:
5227           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5228           break;
5229
5230         case SYMBOL_REF:
5231           output_addr_const (f, x);
5232           break;
5233
5234         case CONST:
5235           if (GET_CODE (XEXP (x, 0)) == PLUS
5236               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5237             {
5238               output_addr_const (f, x);
5239               break;
5240             }
5241           /* Fall through.  */
5242
5243         default:
5244           output_operand_lossage ("Unsupported operand for code '%c'", code);
5245         }
5246       break;
5247
5248     case 'e':
5249       {
5250         int n;
5251
5252         if (!CONST_INT_P (x)
5253             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5254           {
5255             output_operand_lossage ("invalid operand for '%%%c'", code);
5256             return;
5257           }
5258
5259         switch (n)
5260           {
5261           case 3:
5262             fputc ('b', f);
5263             break;
5264           case 4:
5265             fputc ('h', f);
5266             break;
5267           case 5:
5268             fputc ('w', f);
5269             break;
5270           default:
5271             output_operand_lossage ("invalid operand for '%%%c'", code);
5272             return;
5273           }
5274       }
5275       break;
5276
5277     case 'p':
5278       {
5279         int n;
5280
5281         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5282           {
5283             output_operand_lossage ("invalid operand for '%%%c'", code);
5284             return;
5285           }
5286
5287         asm_fprintf (f, "%d", n);
5288       }
5289       break;
5290
5291     case 'P':
5292       if (!CONST_INT_P (x))
5293         {
5294           output_operand_lossage ("invalid operand for '%%%c'", code);
5295           return;
5296         }
5297
5298       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5299       break;
5300
5301     case 'H':
5302       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5303         {
5304           output_operand_lossage ("invalid operand for '%%%c'", code);
5305           return;
5306         }
5307
5308       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5309       break;
5310
5311     case 'M':
5312     case 'm':
5313       {
5314         int cond_code;
5315         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5316         if (x == const_true_rtx)
5317           {
5318             if (code == 'M')
5319               fputs ("nv", f);
5320             return;
5321           }
5322
5323         if (!COMPARISON_P (x))
5324           {
5325             output_operand_lossage ("invalid operand for '%%%c'", code);
5326             return;
5327           }
5328
5329         cond_code = aarch64_get_condition_code (x);
5330         gcc_assert (cond_code >= 0);
5331         if (code == 'M')
5332           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5333         fputs (aarch64_condition_codes[cond_code], f);
5334       }
5335       break;
5336
5337     case 'b':
5338     case 'h':
5339     case 's':
5340     case 'd':
5341     case 'q':
5342       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5343         {
5344           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5345           return;
5346         }
5347       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5348       break;
5349
5350     case 'S':
5351     case 'T':
5352     case 'U':
5353     case 'V':
5354       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5355         {
5356           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5357           return;
5358         }
5359       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5360       break;
5361
5362     case 'R':
5363       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5364         {
5365           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5366           return;
5367         }
5368       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5369       break;
5370
5371     case 'X':
5372       if (!CONST_INT_P (x))
5373         {
5374           output_operand_lossage ("invalid operand for '%%%c'", code);
5375           return;
5376         }
5377       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5378       break;
5379
5380     case 'w':
5381     case 'x':
5382       if (x == const0_rtx
5383           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5384         {
5385           asm_fprintf (f, "%czr", code);
5386           break;
5387         }
5388
5389       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5390         {
5391           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5392           break;
5393         }
5394
5395       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5396         {
5397           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5398           break;
5399         }
5400
5401       /* Fall through */
5402
5403     case 0:
5404       if (x == NULL)
5405         {
5406           output_operand_lossage ("missing operand");
5407           return;
5408         }
5409
5410       switch (GET_CODE (x))
5411         {
5412         case REG:
5413           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5414           break;
5415
5416         case MEM:
5417           output_address (GET_MODE (x), XEXP (x, 0));
5418           /* Check all memory references are Pmode - even with ILP32.  */
5419           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5420           break;
5421
5422         case CONST:
5423         case LABEL_REF:
5424         case SYMBOL_REF:
5425           output_addr_const (asm_out_file, x);
5426           break;
5427
5428         case CONST_INT:
5429           asm_fprintf (f, "%wd", INTVAL (x));
5430           break;
5431
5432         case CONST_VECTOR:
5433           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5434             {
5435               gcc_assert (
5436                   aarch64_const_vec_all_same_in_range_p (x,
5437                                                          HOST_WIDE_INT_MIN,
5438                                                          HOST_WIDE_INT_MAX));
5439               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5440             }
5441           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5442             {
5443               fputc ('0', f);
5444             }
5445           else
5446             gcc_unreachable ();
5447           break;
5448
5449         case CONST_DOUBLE:
5450           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5451              be getting CONST_DOUBLEs holding integers.  */
5452           gcc_assert (GET_MODE (x) != VOIDmode);
5453           if (aarch64_float_const_zero_rtx_p (x))
5454             {
5455               fputc ('0', f);
5456               break;
5457             }
5458           else if (aarch64_float_const_representable_p (x))
5459             {
5460 #define buf_size 20
5461               char float_buf[buf_size] = {'\0'};
5462               real_to_decimal_for_mode (float_buf,
5463                                         CONST_DOUBLE_REAL_VALUE (x),
5464                                         buf_size, buf_size,
5465                                         1, GET_MODE (x));
5466               asm_fprintf (asm_out_file, "%s", float_buf);
5467               break;
5468 #undef buf_size
5469             }
5470           output_operand_lossage ("invalid constant");
5471           return;
5472         default:
5473           output_operand_lossage ("invalid operand");
5474           return;
5475         }
5476       break;
5477
5478     case 'A':
5479       if (GET_CODE (x) == HIGH)
5480         x = XEXP (x, 0);
5481
5482       switch (aarch64_classify_symbolic_expression (x))
5483         {
5484         case SYMBOL_SMALL_GOT_4G:
5485           asm_fprintf (asm_out_file, ":got:");
5486           break;
5487
5488         case SYMBOL_SMALL_TLSGD:
5489           asm_fprintf (asm_out_file, ":tlsgd:");
5490           break;
5491
5492         case SYMBOL_SMALL_TLSDESC:
5493           asm_fprintf (asm_out_file, ":tlsdesc:");
5494           break;
5495
5496         case SYMBOL_SMALL_TLSIE:
5497           asm_fprintf (asm_out_file, ":gottprel:");
5498           break;
5499
5500         case SYMBOL_TLSLE24:
5501           asm_fprintf (asm_out_file, ":tprel:");
5502           break;
5503
5504         case SYMBOL_TINY_GOT:
5505           gcc_unreachable ();
5506           break;
5507
5508         default:
5509           break;
5510         }
5511       output_addr_const (asm_out_file, x);
5512       break;
5513
5514     case 'L':
5515       switch (aarch64_classify_symbolic_expression (x))
5516         {
5517         case SYMBOL_SMALL_GOT_4G:
5518           asm_fprintf (asm_out_file, ":lo12:");
5519           break;
5520
5521         case SYMBOL_SMALL_TLSGD:
5522           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5523           break;
5524
5525         case SYMBOL_SMALL_TLSDESC:
5526           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5527           break;
5528
5529         case SYMBOL_SMALL_TLSIE:
5530           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5531           break;
5532
5533         case SYMBOL_TLSLE12:
5534           asm_fprintf (asm_out_file, ":tprel_lo12:");
5535           break;
5536
5537         case SYMBOL_TLSLE24:
5538           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5539           break;
5540
5541         case SYMBOL_TINY_GOT:
5542           asm_fprintf (asm_out_file, ":got:");
5543           break;
5544
5545         case SYMBOL_TINY_TLSIE:
5546           asm_fprintf (asm_out_file, ":gottprel:");
5547           break;
5548
5549         default:
5550           break;
5551         }
5552       output_addr_const (asm_out_file, x);
5553       break;
5554
5555     case 'G':
5556       switch (aarch64_classify_symbolic_expression (x))
5557         {
5558         case SYMBOL_TLSLE24:
5559           asm_fprintf (asm_out_file, ":tprel_hi12:");
5560           break;
5561         default:
5562           break;
5563         }
5564       output_addr_const (asm_out_file, x);
5565       break;
5566
5567     case 'k':
5568       {
5569         HOST_WIDE_INT cond_code;
5570
5571         if (!CONST_INT_P (x))
5572           {
5573             output_operand_lossage ("invalid operand for '%%%c'", code);
5574             return;
5575           }
5576
5577         cond_code = INTVAL (x);
5578         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5579         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5580       }
5581       break;
5582
5583     default:
5584       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5585       return;
5586     }
5587 }
5588
5589 static void
5590 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5591 {
5592   struct aarch64_address_info addr;
5593
5594   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5595     switch (addr.type)
5596       {
5597       case ADDRESS_REG_IMM:
5598         if (addr.offset == const0_rtx)
5599           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5600         else
5601           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5602                        INTVAL (addr.offset));
5603         return;
5604
5605       case ADDRESS_REG_REG:
5606         if (addr.shift == 0)
5607           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5608                        reg_names [REGNO (addr.offset)]);
5609         else
5610           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5611                        reg_names [REGNO (addr.offset)], addr.shift);
5612         return;
5613
5614       case ADDRESS_REG_UXTW:
5615         if (addr.shift == 0)
5616           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5617                        REGNO (addr.offset) - R0_REGNUM);
5618         else
5619           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5620                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5621         return;
5622
5623       case ADDRESS_REG_SXTW:
5624         if (addr.shift == 0)
5625           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5626                        REGNO (addr.offset) - R0_REGNUM);
5627         else
5628           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5629                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5630         return;
5631
5632       case ADDRESS_REG_WB:
5633         switch (GET_CODE (x))
5634           {
5635           case PRE_INC:
5636             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5637                          GET_MODE_SIZE (mode));
5638             return;
5639           case POST_INC:
5640             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5641                          GET_MODE_SIZE (mode));
5642             return;
5643           case PRE_DEC:
5644             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5645                          GET_MODE_SIZE (mode));
5646             return;
5647           case POST_DEC:
5648             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5649                          GET_MODE_SIZE (mode));
5650             return;
5651           case PRE_MODIFY:
5652             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5653                          INTVAL (addr.offset));
5654             return;
5655           case POST_MODIFY:
5656             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5657                          INTVAL (addr.offset));
5658             return;
5659           default:
5660             break;
5661           }
5662         break;
5663
5664       case ADDRESS_LO_SUM:
5665         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5666         output_addr_const (f, addr.offset);
5667         asm_fprintf (f, "]");
5668         return;
5669
5670       case ADDRESS_SYMBOLIC:
5671         break;
5672       }
5673
5674   output_addr_const (f, x);
5675 }
5676
5677 bool
5678 aarch64_label_mentioned_p (rtx x)
5679 {
5680   const char *fmt;
5681   int i;
5682
5683   if (GET_CODE (x) == LABEL_REF)
5684     return true;
5685
5686   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5687      referencing instruction, but they are constant offsets, not
5688      symbols.  */
5689   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5690     return false;
5691
5692   fmt = GET_RTX_FORMAT (GET_CODE (x));
5693   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5694     {
5695       if (fmt[i] == 'E')
5696         {
5697           int j;
5698
5699           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5700             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5701               return 1;
5702         }
5703       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5704         return 1;
5705     }
5706
5707   return 0;
5708 }
5709
5710 /* Implement REGNO_REG_CLASS.  */
5711
5712 enum reg_class
5713 aarch64_regno_regclass (unsigned regno)
5714 {
5715   if (GP_REGNUM_P (regno))
5716     return GENERAL_REGS;
5717
5718   if (regno == SP_REGNUM)
5719     return STACK_REG;
5720
5721   if (regno == FRAME_POINTER_REGNUM
5722       || regno == ARG_POINTER_REGNUM)
5723     return POINTER_REGS;
5724
5725   if (FP_REGNUM_P (regno))
5726     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5727
5728   return NO_REGS;
5729 }
5730
5731 static rtx
5732 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5733 {
5734   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5735      where mask is selected by alignment and size of the offset.
5736      We try to pick as large a range for the offset as possible to
5737      maximize the chance of a CSE.  However, for aligned addresses
5738      we limit the range to 4k so that structures with different sized
5739      elements are likely to use the same base.  We need to be careful
5740      not to split a CONST for some forms of address expression, otherwise
5741      it will generate sub-optimal code.  */
5742
5743   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5744     {
5745       rtx base = XEXP (x, 0);
5746       rtx offset_rtx = XEXP (x, 1);
5747       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5748
5749       if (GET_CODE (base) == PLUS)
5750         {
5751           rtx op0 = XEXP (base, 0);
5752           rtx op1 = XEXP (base, 1);
5753
5754           /* Force any scaling into a temp for CSE.  */
5755           op0 = force_reg (Pmode, op0);
5756           op1 = force_reg (Pmode, op1);
5757
5758           /* Let the pointer register be in op0.  */
5759           if (REG_POINTER (op1))
5760             std::swap (op0, op1);
5761
5762           /* If the pointer is virtual or frame related, then we know that
5763              virtual register instantiation or register elimination is going
5764              to apply a second constant.  We want the two constants folded
5765              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5766           if (virt_or_elim_regno_p (REGNO (op0)))
5767             {
5768               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5769                                    NULL_RTX, true, OPTAB_DIRECT);
5770               return gen_rtx_PLUS (Pmode, base, op1);
5771             }
5772
5773           /* Otherwise, in order to encourage CSE (and thence loop strength
5774              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5775           base = expand_binop (Pmode, add_optab, op0, op1,
5776                                NULL_RTX, true, OPTAB_DIRECT);
5777           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5778         }
5779
5780       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5781       HOST_WIDE_INT base_offset;
5782       if (GET_MODE_SIZE (mode) > 16)
5783         base_offset = (offset + 0x400) & ~0x7f0;
5784       /* For offsets aren't a multiple of the access size, the limit is
5785          -256...255.  */
5786       else if (offset & (GET_MODE_SIZE (mode) - 1))
5787         {
5788           base_offset = (offset + 0x100) & ~0x1ff;
5789
5790           /* BLKmode typically uses LDP of X-registers.  */
5791           if (mode == BLKmode)
5792             base_offset = (offset + 512) & ~0x3ff;
5793         }
5794       /* Small negative offsets are supported.  */
5795       else if (IN_RANGE (offset, -256, 0))
5796         base_offset = 0;
5797       else if (mode == TImode || mode == TFmode)
5798         base_offset = (offset + 0x100) & ~0x1ff;
5799       /* Use 12-bit offset by access size.  */
5800       else
5801         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5802
5803       if (base_offset != 0)
5804         {
5805           base = plus_constant (Pmode, base, base_offset);
5806           base = force_operand (base, NULL_RTX);
5807           return plus_constant (Pmode, base, offset - base_offset);
5808         }
5809     }
5810
5811   return x;
5812 }
5813
5814 /* Return the reload icode required for a constant pool in mode.  */
5815 static enum insn_code
5816 aarch64_constant_pool_reload_icode (machine_mode mode)
5817 {
5818   switch (mode)
5819     {
5820     case E_SFmode:
5821       return CODE_FOR_aarch64_reload_movcpsfdi;
5822
5823     case E_DFmode:
5824       return CODE_FOR_aarch64_reload_movcpdfdi;
5825
5826     case E_TFmode:
5827       return CODE_FOR_aarch64_reload_movcptfdi;
5828
5829     case E_V8QImode:
5830       return CODE_FOR_aarch64_reload_movcpv8qidi;
5831
5832     case E_V16QImode:
5833       return CODE_FOR_aarch64_reload_movcpv16qidi;
5834
5835     case E_V4HImode:
5836       return CODE_FOR_aarch64_reload_movcpv4hidi;
5837
5838     case E_V8HImode:
5839       return CODE_FOR_aarch64_reload_movcpv8hidi;
5840
5841     case E_V2SImode:
5842       return CODE_FOR_aarch64_reload_movcpv2sidi;
5843
5844     case E_V4SImode:
5845       return CODE_FOR_aarch64_reload_movcpv4sidi;
5846
5847     case E_V2DImode:
5848       return CODE_FOR_aarch64_reload_movcpv2didi;
5849
5850     case E_V2DFmode:
5851       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5852
5853     default:
5854       gcc_unreachable ();
5855     }
5856
5857   gcc_unreachable ();
5858 }
5859 static reg_class_t
5860 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5861                           reg_class_t rclass,
5862                           machine_mode mode,
5863                           secondary_reload_info *sri)
5864 {
5865
5866   /* If we have to disable direct literal pool loads and stores because the
5867      function is too big, then we need a scratch register.  */
5868   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5869       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5870           || targetm.vector_mode_supported_p (GET_MODE (x)))
5871       && !aarch64_pcrelative_literal_loads)
5872     {
5873       sri->icode = aarch64_constant_pool_reload_icode (mode);
5874       return NO_REGS;
5875     }
5876
5877   /* Without the TARGET_SIMD instructions we cannot move a Q register
5878      to a Q register directly.  We need a scratch.  */
5879   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5880       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5881       && reg_class_subset_p (rclass, FP_REGS))
5882     {
5883       if (mode == TFmode)
5884         sri->icode = CODE_FOR_aarch64_reload_movtf;
5885       else if (mode == TImode)
5886         sri->icode = CODE_FOR_aarch64_reload_movti;
5887       return NO_REGS;
5888     }
5889
5890   /* A TFmode or TImode memory access should be handled via an FP_REGS
5891      because AArch64 has richer addressing modes for LDR/STR instructions
5892      than LDP/STP instructions.  */
5893   if (TARGET_FLOAT && rclass == GENERAL_REGS
5894       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5895     return FP_REGS;
5896
5897   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5898       return GENERAL_REGS;
5899
5900   return NO_REGS;
5901 }
5902
5903 static bool
5904 aarch64_can_eliminate (const int from, const int to)
5905 {
5906   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5907      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5908
5909   if (frame_pointer_needed)
5910     {
5911       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5912         return true;
5913       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5914         return false;
5915       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5916           && !cfun->calls_alloca)
5917         return true;
5918       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5919         return true;
5920
5921       return false;
5922     }
5923   else
5924     {
5925       /* If we decided that we didn't need a leaf frame pointer but then used
5926          LR in the function, then we'll want a frame pointer after all, so
5927          prevent this elimination to ensure a frame pointer is used.  */
5928       if (to == STACK_POINTER_REGNUM
5929           && flag_omit_frame_pointer == 2
5930           && flag_omit_leaf_frame_pointer
5931           && df_regs_ever_live_p (LR_REGNUM))
5932         return false;
5933     }
5934
5935   return true;
5936 }
5937
5938 HOST_WIDE_INT
5939 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5940 {
5941   aarch64_layout_frame ();
5942
5943   if (to == HARD_FRAME_POINTER_REGNUM)
5944     {
5945       if (from == ARG_POINTER_REGNUM)
5946         return cfun->machine->frame.hard_fp_offset;
5947
5948       if (from == FRAME_POINTER_REGNUM)
5949         return cfun->machine->frame.hard_fp_offset
5950                - cfun->machine->frame.locals_offset;
5951     }
5952
5953   if (to == STACK_POINTER_REGNUM)
5954     {
5955       if (from == FRAME_POINTER_REGNUM)
5956           return cfun->machine->frame.frame_size
5957                  - cfun->machine->frame.locals_offset;
5958     }
5959
5960   return cfun->machine->frame.frame_size;
5961 }
5962
5963 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5964    previous frame.  */
5965
5966 rtx
5967 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5968 {
5969   if (count != 0)
5970     return const0_rtx;
5971   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5972 }
5973
5974
5975 static void
5976 aarch64_asm_trampoline_template (FILE *f)
5977 {
5978   if (TARGET_ILP32)
5979     {
5980       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5981       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5982     }
5983   else
5984     {
5985       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5986       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5987     }
5988   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5989   assemble_aligned_integer (4, const0_rtx);
5990   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5991   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5992 }
5993
5994 static void
5995 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5996 {
5997   rtx fnaddr, mem, a_tramp;
5998   const int tramp_code_sz = 16;
5999
6000   /* Don't need to copy the trailing D-words, we fill those in below.  */
6001   emit_block_move (m_tramp, assemble_trampoline_template (),
6002                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
6003   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
6004   fnaddr = XEXP (DECL_RTL (fndecl), 0);
6005   if (GET_MODE (fnaddr) != ptr_mode)
6006     fnaddr = convert_memory_address (ptr_mode, fnaddr);
6007   emit_move_insn (mem, fnaddr);
6008
6009   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6010   emit_move_insn (mem, chain_value);
6011
6012   /* XXX We should really define a "clear_cache" pattern and use
6013      gen_clear_cache().  */
6014   a_tramp = XEXP (m_tramp, 0);
6015   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6016                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6017                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6018                      ptr_mode);
6019 }
6020
6021 static unsigned char
6022 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6023 {
6024   switch (regclass)
6025     {
6026     case CALLER_SAVE_REGS:
6027     case POINTER_REGS:
6028     case GENERAL_REGS:
6029     case ALL_REGS:
6030     case POINTER_AND_FP_REGS:
6031     case FP_REGS:
6032     case FP_LO_REGS:
6033       return
6034         aarch64_vector_mode_p (mode)
6035           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6036           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6037     case STACK_REG:
6038       return 1;
6039
6040     case NO_REGS:
6041       return 0;
6042
6043     default:
6044       break;
6045     }
6046   gcc_unreachable ();
6047 }
6048
6049 static reg_class_t
6050 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6051 {
6052   if (regclass == POINTER_REGS)
6053     return GENERAL_REGS;
6054
6055   if (regclass == STACK_REG)
6056     {
6057       if (REG_P(x)
6058           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6059           return regclass;
6060
6061       return NO_REGS;
6062     }
6063
6064   /* Register eliminiation can result in a request for
6065      SP+constant->FP_REGS.  We cannot support such operations which
6066      use SP as source and an FP_REG as destination, so reject out
6067      right now.  */
6068   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6069     {
6070       rtx lhs = XEXP (x, 0);
6071
6072       /* Look through a possible SUBREG introduced by ILP32.  */
6073       if (GET_CODE (lhs) == SUBREG)
6074         lhs = SUBREG_REG (lhs);
6075
6076       gcc_assert (REG_P (lhs));
6077       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6078                                       POINTER_REGS));
6079       return NO_REGS;
6080     }
6081
6082   return regclass;
6083 }
6084
6085 void
6086 aarch64_asm_output_labelref (FILE* f, const char *name)
6087 {
6088   asm_fprintf (f, "%U%s", name);
6089 }
6090
6091 static void
6092 aarch64_elf_asm_constructor (rtx symbol, int priority)
6093 {
6094   if (priority == DEFAULT_INIT_PRIORITY)
6095     default_ctor_section_asm_out_constructor (symbol, priority);
6096   else
6097     {
6098       section *s;
6099       /* While priority is known to be in range [0, 65535], so 18 bytes
6100          would be enough, the compiler might not know that.  To avoid
6101          -Wformat-truncation false positive, use a larger size.  */
6102       char buf[23];
6103       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6104       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6105       switch_to_section (s);
6106       assemble_align (POINTER_SIZE);
6107       assemble_aligned_integer (POINTER_BYTES, symbol);
6108     }
6109 }
6110
6111 static void
6112 aarch64_elf_asm_destructor (rtx symbol, int priority)
6113 {
6114   if (priority == DEFAULT_INIT_PRIORITY)
6115     default_dtor_section_asm_out_destructor (symbol, priority);
6116   else
6117     {
6118       section *s;
6119       /* While priority is known to be in range [0, 65535], so 18 bytes
6120          would be enough, the compiler might not know that.  To avoid
6121          -Wformat-truncation false positive, use a larger size.  */
6122       char buf[23];
6123       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6124       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6125       switch_to_section (s);
6126       assemble_align (POINTER_SIZE);
6127       assemble_aligned_integer (POINTER_BYTES, symbol);
6128     }
6129 }
6130
6131 const char*
6132 aarch64_output_casesi (rtx *operands)
6133 {
6134   char buf[100];
6135   char label[100];
6136   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6137   int index;
6138   static const char *const patterns[4][2] =
6139   {
6140     {
6141       "ldrb\t%w3, [%0,%w1,uxtw]",
6142       "add\t%3, %4, %w3, sxtb #2"
6143     },
6144     {
6145       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6146       "add\t%3, %4, %w3, sxth #2"
6147     },
6148     {
6149       "ldr\t%w3, [%0,%w1,uxtw #2]",
6150       "add\t%3, %4, %w3, sxtw #2"
6151     },
6152     /* We assume that DImode is only generated when not optimizing and
6153        that we don't really need 64-bit address offsets.  That would
6154        imply an object file with 8GB of code in a single function!  */
6155     {
6156       "ldr\t%w3, [%0,%w1,uxtw #2]",
6157       "add\t%3, %4, %w3, sxtw #2"
6158     }
6159   };
6160
6161   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6162
6163   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6164   index = exact_log2 (GET_MODE_SIZE (mode));
6165
6166   gcc_assert (index >= 0 && index <= 3);
6167
6168   /* Need to implement table size reduction, by chaning the code below.  */
6169   output_asm_insn (patterns[index][0], operands);
6170   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6171   snprintf (buf, sizeof (buf),
6172             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6173   output_asm_insn (buf, operands);
6174   output_asm_insn (patterns[index][1], operands);
6175   output_asm_insn ("br\t%3", operands);
6176   assemble_label (asm_out_file, label);
6177   return "";
6178 }
6179
6180
6181 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6182    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6183    operator.  */
6184
6185 int
6186 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6187 {
6188   if (shift >= 0 && shift <= 3)
6189     {
6190       int size;
6191       for (size = 8; size <= 32; size *= 2)
6192         {
6193           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6194           if (mask == bits << shift)
6195             return size;
6196         }
6197     }
6198   return 0;
6199 }
6200
6201 /* Constant pools are per function only when PC relative
6202    literal loads are true or we are in the large memory
6203    model.  */
6204
6205 static inline bool
6206 aarch64_can_use_per_function_literal_pools_p (void)
6207 {
6208   return (aarch64_pcrelative_literal_loads
6209           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6210 }
6211
6212 static bool
6213 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6214 {
6215   /* Fixme:: In an ideal world this would work similar
6216      to the logic in aarch64_select_rtx_section but this
6217      breaks bootstrap in gcc go.  For now we workaround
6218      this by returning false here.  */
6219   return false;
6220 }
6221
6222 /* Select appropriate section for constants depending
6223    on where we place literal pools.  */
6224
6225 static section *
6226 aarch64_select_rtx_section (machine_mode mode,
6227                             rtx x,
6228                             unsigned HOST_WIDE_INT align)
6229 {
6230   if (aarch64_can_use_per_function_literal_pools_p ())
6231     return function_section (current_function_decl);
6232
6233   return default_elf_select_rtx_section (mode, x, align);
6234 }
6235
6236 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6237 void
6238 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6239                                   HOST_WIDE_INT offset)
6240 {
6241   /* When using per-function literal pools, we must ensure that any code
6242      section is aligned to the minimal instruction length, lest we get
6243      errors from the assembler re "unaligned instructions".  */
6244   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6245     ASM_OUTPUT_ALIGN (f, 2);
6246 }
6247
6248 /* Costs.  */
6249
6250 /* Helper function for rtx cost calculation.  Strip a shift expression
6251    from X.  Returns the inner operand if successful, or the original
6252    expression on failure.  */
6253 static rtx
6254 aarch64_strip_shift (rtx x)
6255 {
6256   rtx op = x;
6257
6258   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6259      we can convert both to ROR during final output.  */
6260   if ((GET_CODE (op) == ASHIFT
6261        || GET_CODE (op) == ASHIFTRT
6262        || GET_CODE (op) == LSHIFTRT
6263        || GET_CODE (op) == ROTATERT
6264        || GET_CODE (op) == ROTATE)
6265       && CONST_INT_P (XEXP (op, 1)))
6266     return XEXP (op, 0);
6267
6268   if (GET_CODE (op) == MULT
6269       && CONST_INT_P (XEXP (op, 1))
6270       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6271     return XEXP (op, 0);
6272
6273   return x;
6274 }
6275
6276 /* Helper function for rtx cost calculation.  Strip an extend
6277    expression from X.  Returns the inner operand if successful, or the
6278    original expression on failure.  We deal with a number of possible
6279    canonicalization variations here. If STRIP_SHIFT is true, then
6280    we can strip off a shift also.  */
6281 static rtx
6282 aarch64_strip_extend (rtx x, bool strip_shift)
6283 {
6284   scalar_int_mode mode;
6285   rtx op = x;
6286
6287   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6288     return op;
6289
6290   /* Zero and sign extraction of a widened value.  */
6291   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6292       && XEXP (op, 2) == const0_rtx
6293       && GET_CODE (XEXP (op, 0)) == MULT
6294       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6295                                          XEXP (op, 1)))
6296     return XEXP (XEXP (op, 0), 0);
6297
6298   /* It can also be represented (for zero-extend) as an AND with an
6299      immediate.  */
6300   if (GET_CODE (op) == AND
6301       && GET_CODE (XEXP (op, 0)) == MULT
6302       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6303       && CONST_INT_P (XEXP (op, 1))
6304       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6305                            INTVAL (XEXP (op, 1))) != 0)
6306     return XEXP (XEXP (op, 0), 0);
6307
6308   /* Now handle extended register, as this may also have an optional
6309      left shift by 1..4.  */
6310   if (strip_shift
6311       && GET_CODE (op) == ASHIFT
6312       && CONST_INT_P (XEXP (op, 1))
6313       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6314     op = XEXP (op, 0);
6315
6316   if (GET_CODE (op) == ZERO_EXTEND
6317       || GET_CODE (op) == SIGN_EXTEND)
6318     op = XEXP (op, 0);
6319
6320   if (op != x)
6321     return op;
6322
6323   return x;
6324 }
6325
6326 /* Return true iff CODE is a shift supported in combination
6327    with arithmetic instructions.  */
6328
6329 static bool
6330 aarch64_shift_p (enum rtx_code code)
6331 {
6332   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6333 }
6334
6335
6336 /* Return true iff X is a cheap shift without a sign extend. */
6337
6338 static bool
6339 aarch64_cheap_mult_shift_p (rtx x)
6340 {
6341   rtx op0, op1;
6342
6343   op0 = XEXP (x, 0);
6344   op1 = XEXP (x, 1);
6345
6346   if (!(aarch64_tune_params.extra_tuning_flags
6347                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6348     return false;
6349
6350   if (GET_CODE (op0) == SIGN_EXTEND)
6351     return false;
6352
6353   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6354       && UINTVAL (op1) <= 4)
6355     return true;
6356
6357   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6358     return false;
6359
6360   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6361
6362   if (l2 > 0 && l2 <= 4)
6363     return true;
6364
6365   return false;
6366 }
6367
6368 /* Helper function for rtx cost calculation.  Calculate the cost of
6369    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6370    Return the calculated cost of the expression, recursing manually in to
6371    operands where needed.  */
6372
6373 static int
6374 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6375 {
6376   rtx op0, op1;
6377   const struct cpu_cost_table *extra_cost
6378     = aarch64_tune_params.insn_extra_cost;
6379   int cost = 0;
6380   bool compound_p = (outer == PLUS || outer == MINUS);
6381   machine_mode mode = GET_MODE (x);
6382
6383   gcc_checking_assert (code == MULT);
6384
6385   op0 = XEXP (x, 0);
6386   op1 = XEXP (x, 1);
6387
6388   if (VECTOR_MODE_P (mode))
6389     mode = GET_MODE_INNER (mode);
6390
6391   /* Integer multiply/fma.  */
6392   if (GET_MODE_CLASS (mode) == MODE_INT)
6393     {
6394       /* The multiply will be canonicalized as a shift, cost it as such.  */
6395       if (aarch64_shift_p (GET_CODE (x))
6396           || (CONST_INT_P (op1)
6397               && exact_log2 (INTVAL (op1)) > 0))
6398         {
6399           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6400                            || GET_CODE (op0) == SIGN_EXTEND;
6401           if (speed)
6402             {
6403               if (compound_p)
6404                 {
6405                   /* If the shift is considered cheap,
6406                      then don't add any cost. */
6407                   if (aarch64_cheap_mult_shift_p (x))
6408                     ;
6409                   else if (REG_P (op1))
6410                     /* ARITH + shift-by-register.  */
6411                     cost += extra_cost->alu.arith_shift_reg;
6412                   else if (is_extend)
6413                     /* ARITH + extended register.  We don't have a cost field
6414                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6415                     cost += extra_cost->alu.extend_arith;
6416                   else
6417                     /* ARITH + shift-by-immediate.  */
6418                     cost += extra_cost->alu.arith_shift;
6419                 }
6420               else
6421                 /* LSL (immediate).  */
6422                 cost += extra_cost->alu.shift;
6423
6424             }
6425           /* Strip extends as we will have costed them in the case above.  */
6426           if (is_extend)
6427             op0 = aarch64_strip_extend (op0, true);
6428
6429           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6430
6431           return cost;
6432         }
6433
6434       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6435          compound and let the below cases handle it.  After all, MNEG is a
6436          special-case alias of MSUB.  */
6437       if (GET_CODE (op0) == NEG)
6438         {
6439           op0 = XEXP (op0, 0);
6440           compound_p = true;
6441         }
6442
6443       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6444       if ((GET_CODE (op0) == ZERO_EXTEND
6445            && GET_CODE (op1) == ZERO_EXTEND)
6446           || (GET_CODE (op0) == SIGN_EXTEND
6447               && GET_CODE (op1) == SIGN_EXTEND))
6448         {
6449           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6450           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6451
6452           if (speed)
6453             {
6454               if (compound_p)
6455                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6456                 cost += extra_cost->mult[0].extend_add;
6457               else
6458                 /* MUL/SMULL/UMULL.  */
6459                 cost += extra_cost->mult[0].extend;
6460             }
6461
6462           return cost;
6463         }
6464
6465       /* This is either an integer multiply or a MADD.  In both cases
6466          we want to recurse and cost the operands.  */
6467       cost += rtx_cost (op0, mode, MULT, 0, speed);
6468       cost += rtx_cost (op1, mode, MULT, 1, speed);
6469
6470       if (speed)
6471         {
6472           if (compound_p)
6473             /* MADD/MSUB.  */
6474             cost += extra_cost->mult[mode == DImode].add;
6475           else
6476             /* MUL.  */
6477             cost += extra_cost->mult[mode == DImode].simple;
6478         }
6479
6480       return cost;
6481     }
6482   else
6483     {
6484       if (speed)
6485         {
6486           /* Floating-point FMA/FMUL can also support negations of the
6487              operands, unless the rounding mode is upward or downward in
6488              which case FNMUL is different than FMUL with operand negation.  */
6489           bool neg0 = GET_CODE (op0) == NEG;
6490           bool neg1 = GET_CODE (op1) == NEG;
6491           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6492             {
6493               if (neg0)
6494                 op0 = XEXP (op0, 0);
6495               if (neg1)
6496                 op1 = XEXP (op1, 0);
6497             }
6498
6499           if (compound_p)
6500             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6501             cost += extra_cost->fp[mode == DFmode].fma;
6502           else
6503             /* FMUL/FNMUL.  */
6504             cost += extra_cost->fp[mode == DFmode].mult;
6505         }
6506
6507       cost += rtx_cost (op0, mode, MULT, 0, speed);
6508       cost += rtx_cost (op1, mode, MULT, 1, speed);
6509       return cost;
6510     }
6511 }
6512
6513 static int
6514 aarch64_address_cost (rtx x,
6515                       machine_mode mode,
6516                       addr_space_t as ATTRIBUTE_UNUSED,
6517                       bool speed)
6518 {
6519   enum rtx_code c = GET_CODE (x);
6520   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6521   struct aarch64_address_info info;
6522   int cost = 0;
6523   info.shift = 0;
6524
6525   if (!aarch64_classify_address (&info, x, mode, c, false))
6526     {
6527       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6528         {
6529           /* This is a CONST or SYMBOL ref which will be split
6530              in a different way depending on the code model in use.
6531              Cost it through the generic infrastructure.  */
6532           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6533           /* Divide through by the cost of one instruction to
6534              bring it to the same units as the address costs.  */
6535           cost_symbol_ref /= COSTS_N_INSNS (1);
6536           /* The cost is then the cost of preparing the address,
6537              followed by an immediate (possibly 0) offset.  */
6538           return cost_symbol_ref + addr_cost->imm_offset;
6539         }
6540       else
6541         {
6542           /* This is most likely a jump table from a case
6543              statement.  */
6544           return addr_cost->register_offset;
6545         }
6546     }
6547
6548   switch (info.type)
6549     {
6550       case ADDRESS_LO_SUM:
6551       case ADDRESS_SYMBOLIC:
6552       case ADDRESS_REG_IMM:
6553         cost += addr_cost->imm_offset;
6554         break;
6555
6556       case ADDRESS_REG_WB:
6557         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6558           cost += addr_cost->pre_modify;
6559         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6560           cost += addr_cost->post_modify;
6561         else
6562           gcc_unreachable ();
6563
6564         break;
6565
6566       case ADDRESS_REG_REG:
6567         cost += addr_cost->register_offset;
6568         break;
6569
6570       case ADDRESS_REG_SXTW:
6571         cost += addr_cost->register_sextend;
6572         break;
6573
6574       case ADDRESS_REG_UXTW:
6575         cost += addr_cost->register_zextend;
6576         break;
6577
6578       default:
6579         gcc_unreachable ();
6580     }
6581
6582
6583   if (info.shift > 0)
6584     {
6585       /* For the sake of calculating the cost of the shifted register
6586          component, we can treat same sized modes in the same way.  */
6587       switch (GET_MODE_BITSIZE (mode))
6588         {
6589           case 16:
6590             cost += addr_cost->addr_scale_costs.hi;
6591             break;
6592
6593           case 32:
6594             cost += addr_cost->addr_scale_costs.si;
6595             break;
6596
6597           case 64:
6598             cost += addr_cost->addr_scale_costs.di;
6599             break;
6600
6601           /* We can't tell, or this is a 128-bit vector.  */
6602           default:
6603             cost += addr_cost->addr_scale_costs.ti;
6604             break;
6605         }
6606     }
6607
6608   return cost;
6609 }
6610
6611 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6612    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6613    to be taken.  */
6614
6615 int
6616 aarch64_branch_cost (bool speed_p, bool predictable_p)
6617 {
6618   /* When optimizing for speed, use the cost of unpredictable branches.  */
6619   const struct cpu_branch_cost *branch_costs =
6620     aarch64_tune_params.branch_costs;
6621
6622   if (!speed_p || predictable_p)
6623     return branch_costs->predictable;
6624   else
6625     return branch_costs->unpredictable;
6626 }
6627
6628 /* Return true if the RTX X in mode MODE is a zero or sign extract
6629    usable in an ADD or SUB (extended register) instruction.  */
6630 static bool
6631 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6632 {
6633   /* Catch add with a sign extract.
6634      This is add_<optab><mode>_multp2.  */
6635   if (GET_CODE (x) == SIGN_EXTRACT
6636       || GET_CODE (x) == ZERO_EXTRACT)
6637     {
6638       rtx op0 = XEXP (x, 0);
6639       rtx op1 = XEXP (x, 1);
6640       rtx op2 = XEXP (x, 2);
6641
6642       if (GET_CODE (op0) == MULT
6643           && CONST_INT_P (op1)
6644           && op2 == const0_rtx
6645           && CONST_INT_P (XEXP (op0, 1))
6646           && aarch64_is_extend_from_extract (mode,
6647                                              XEXP (op0, 1),
6648                                              op1))
6649         {
6650           return true;
6651         }
6652     }
6653   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6654      No shift.  */
6655   else if (GET_CODE (x) == SIGN_EXTEND
6656            || GET_CODE (x) == ZERO_EXTEND)
6657     return REG_P (XEXP (x, 0));
6658
6659   return false;
6660 }
6661
6662 static bool
6663 aarch64_frint_unspec_p (unsigned int u)
6664 {
6665   switch (u)
6666     {
6667       case UNSPEC_FRINTZ:
6668       case UNSPEC_FRINTP:
6669       case UNSPEC_FRINTM:
6670       case UNSPEC_FRINTA:
6671       case UNSPEC_FRINTN:
6672       case UNSPEC_FRINTX:
6673       case UNSPEC_FRINTI:
6674         return true;
6675
6676       default:
6677         return false;
6678     }
6679 }
6680
6681 /* Return true iff X is an rtx that will match an extr instruction
6682    i.e. as described in the *extr<mode>5_insn family of patterns.
6683    OP0 and OP1 will be set to the operands of the shifts involved
6684    on success and will be NULL_RTX otherwise.  */
6685
6686 static bool
6687 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6688 {
6689   rtx op0, op1;
6690   scalar_int_mode mode;
6691   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6692     return false;
6693
6694   *res_op0 = NULL_RTX;
6695   *res_op1 = NULL_RTX;
6696
6697   if (GET_CODE (x) != IOR)
6698     return false;
6699
6700   op0 = XEXP (x, 0);
6701   op1 = XEXP (x, 1);
6702
6703   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6704       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6705     {
6706      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6707       if (GET_CODE (op1) == ASHIFT)
6708         std::swap (op0, op1);
6709
6710       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6711         return false;
6712
6713       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6714       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6715
6716       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6717           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6718         {
6719           *res_op0 = XEXP (op0, 0);
6720           *res_op1 = XEXP (op1, 0);
6721           return true;
6722         }
6723     }
6724
6725   return false;
6726 }
6727
6728 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6729    storing it in *COST.  Result is true if the total cost of the operation
6730    has now been calculated.  */
6731 static bool
6732 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6733 {
6734   rtx inner;
6735   rtx comparator;
6736   enum rtx_code cmpcode;
6737
6738   if (COMPARISON_P (op0))
6739     {
6740       inner = XEXP (op0, 0);
6741       comparator = XEXP (op0, 1);
6742       cmpcode = GET_CODE (op0);
6743     }
6744   else
6745     {
6746       inner = op0;
6747       comparator = const0_rtx;
6748       cmpcode = NE;
6749     }
6750
6751   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6752     {
6753       /* Conditional branch.  */
6754       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6755         return true;
6756       else
6757         {
6758           if (cmpcode == NE || cmpcode == EQ)
6759             {
6760               if (comparator == const0_rtx)
6761                 {
6762                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6763                   if (GET_CODE (inner) == ZERO_EXTRACT)
6764                     /* TBZ/TBNZ.  */
6765                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6766                                        ZERO_EXTRACT, 0, speed);
6767                   else
6768                     /* CBZ/CBNZ.  */
6769                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6770
6771                 return true;
6772               }
6773             }
6774           else if (cmpcode == LT || cmpcode == GE)
6775             {
6776               /* TBZ/TBNZ.  */
6777               if (comparator == const0_rtx)
6778                 return true;
6779             }
6780         }
6781     }
6782   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6783     {
6784       /* CCMP.  */
6785       if (GET_CODE (op1) == COMPARE)
6786         {
6787           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6788           if (XEXP (op1, 1) == const0_rtx)
6789             *cost += 1;
6790           if (speed)
6791             {
6792               machine_mode mode = GET_MODE (XEXP (op1, 0));
6793               const struct cpu_cost_table *extra_cost
6794                 = aarch64_tune_params.insn_extra_cost;
6795
6796               if (GET_MODE_CLASS (mode) == MODE_INT)
6797                 *cost += extra_cost->alu.arith;
6798               else
6799                 *cost += extra_cost->fp[mode == DFmode].compare;
6800             }
6801           return true;
6802         }
6803
6804       /* It's a conditional operation based on the status flags,
6805          so it must be some flavor of CSEL.  */
6806
6807       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6808       if (GET_CODE (op1) == NEG
6809           || GET_CODE (op1) == NOT
6810           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6811         op1 = XEXP (op1, 0);
6812       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6813         {
6814           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6815           op1 = XEXP (op1, 0);
6816           op2 = XEXP (op2, 0);
6817         }
6818
6819       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6820       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6821       return true;
6822     }
6823
6824   /* We don't know what this is, cost all operands.  */
6825   return false;
6826 }
6827
6828 /* Check whether X is a bitfield operation of the form shift + extend that
6829    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6830    operand to which the bitfield operation is applied.  Otherwise return
6831    NULL_RTX.  */
6832
6833 static rtx
6834 aarch64_extend_bitfield_pattern_p (rtx x)
6835 {
6836   rtx_code outer_code = GET_CODE (x);
6837   machine_mode outer_mode = GET_MODE (x);
6838
6839   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6840       && outer_mode != SImode && outer_mode != DImode)
6841     return NULL_RTX;
6842
6843   rtx inner = XEXP (x, 0);
6844   rtx_code inner_code = GET_CODE (inner);
6845   machine_mode inner_mode = GET_MODE (inner);
6846   rtx op = NULL_RTX;
6847
6848   switch (inner_code)
6849     {
6850       case ASHIFT:
6851         if (CONST_INT_P (XEXP (inner, 1))
6852             && (inner_mode == QImode || inner_mode == HImode))
6853           op = XEXP (inner, 0);
6854         break;
6855       case LSHIFTRT:
6856         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6857             && (inner_mode == QImode || inner_mode == HImode))
6858           op = XEXP (inner, 0);
6859         break;
6860       case ASHIFTRT:
6861         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6862             && (inner_mode == QImode || inner_mode == HImode))
6863           op = XEXP (inner, 0);
6864         break;
6865       default:
6866         break;
6867     }
6868
6869   return op;
6870 }
6871
6872 /* Return true if the mask and a shift amount from an RTX of the form
6873    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6874    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6875
6876 bool
6877 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6878                                     rtx shft_amnt)
6879 {
6880   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6881          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6882          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6883          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6884 }
6885
6886 /* Calculate the cost of calculating X, storing it in *COST.  Result
6887    is true if the total cost of the operation has now been calculated.  */
6888 static bool
6889 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6890                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6891 {
6892   rtx op0, op1, op2;
6893   const struct cpu_cost_table *extra_cost
6894     = aarch64_tune_params.insn_extra_cost;
6895   int code = GET_CODE (x);
6896   scalar_int_mode int_mode;
6897
6898   /* By default, assume that everything has equivalent cost to the
6899      cheapest instruction.  Any additional costs are applied as a delta
6900      above this default.  */
6901   *cost = COSTS_N_INSNS (1);
6902
6903   switch (code)
6904     {
6905     case SET:
6906       /* The cost depends entirely on the operands to SET.  */
6907       *cost = 0;
6908       op0 = SET_DEST (x);
6909       op1 = SET_SRC (x);
6910
6911       switch (GET_CODE (op0))
6912         {
6913         case MEM:
6914           if (speed)
6915             {
6916               rtx address = XEXP (op0, 0);
6917               if (VECTOR_MODE_P (mode))
6918                 *cost += extra_cost->ldst.storev;
6919               else if (GET_MODE_CLASS (mode) == MODE_INT)
6920                 *cost += extra_cost->ldst.store;
6921               else if (mode == SFmode)
6922                 *cost += extra_cost->ldst.storef;
6923               else if (mode == DFmode)
6924                 *cost += extra_cost->ldst.stored;
6925
6926               *cost +=
6927                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6928                                                      0, speed));
6929             }
6930
6931           *cost += rtx_cost (op1, mode, SET, 1, speed);
6932           return true;
6933
6934         case SUBREG:
6935           if (! REG_P (SUBREG_REG (op0)))
6936             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6937
6938           /* Fall through.  */
6939         case REG:
6940           /* The cost is one per vector-register copied.  */
6941           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6942             {
6943               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6944                               / GET_MODE_SIZE (V4SImode);
6945               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6946             }
6947           /* const0_rtx is in general free, but we will use an
6948              instruction to set a register to 0.  */
6949           else if (REG_P (op1) || op1 == const0_rtx)
6950             {
6951               /* The cost is 1 per register copied.  */
6952               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6953                               / UNITS_PER_WORD;
6954               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6955             }
6956           else
6957             /* Cost is just the cost of the RHS of the set.  */
6958             *cost += rtx_cost (op1, mode, SET, 1, speed);
6959           return true;
6960
6961         case ZERO_EXTRACT:
6962         case SIGN_EXTRACT:
6963           /* Bit-field insertion.  Strip any redundant widening of
6964              the RHS to meet the width of the target.  */
6965           if (GET_CODE (op1) == SUBREG)
6966             op1 = SUBREG_REG (op1);
6967           if ((GET_CODE (op1) == ZERO_EXTEND
6968                || GET_CODE (op1) == SIGN_EXTEND)
6969               && CONST_INT_P (XEXP (op0, 1))
6970               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6971               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6972             op1 = XEXP (op1, 0);
6973
6974           if (CONST_INT_P (op1))
6975             {
6976               /* MOV immediate is assumed to always be cheap.  */
6977               *cost = COSTS_N_INSNS (1);
6978             }
6979           else
6980             {
6981               /* BFM.  */
6982               if (speed)
6983                 *cost += extra_cost->alu.bfi;
6984               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6985             }
6986
6987           return true;
6988
6989         default:
6990           /* We can't make sense of this, assume default cost.  */
6991           *cost = COSTS_N_INSNS (1);
6992           return false;
6993         }
6994       return false;
6995
6996     case CONST_INT:
6997       /* If an instruction can incorporate a constant within the
6998          instruction, the instruction's expression avoids calling
6999          rtx_cost() on the constant.  If rtx_cost() is called on a
7000          constant, then it is usually because the constant must be
7001          moved into a register by one or more instructions.
7002
7003          The exception is constant 0, which can be expressed
7004          as XZR/WZR and is therefore free.  The exception to this is
7005          if we have (set (reg) (const0_rtx)) in which case we must cost
7006          the move.  However, we can catch that when we cost the SET, so
7007          we don't need to consider that here.  */
7008       if (x == const0_rtx)
7009         *cost = 0;
7010       else
7011         {
7012           /* To an approximation, building any other constant is
7013              proportionally expensive to the number of instructions
7014              required to build that constant.  This is true whether we
7015              are compiling for SPEED or otherwise.  */
7016           if (!is_a <scalar_int_mode> (mode, &int_mode))
7017             int_mode = word_mode;
7018           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7019                                  (NULL_RTX, x, false, int_mode));
7020         }
7021       return true;
7022
7023     case CONST_DOUBLE:
7024
7025       /* First determine number of instructions to do the move
7026           as an integer constant.  */
7027       if (!aarch64_float_const_representable_p (x)
7028            && !aarch64_can_const_movi_rtx_p (x, mode)
7029            && aarch64_float_const_rtx_p (x))
7030         {
7031           unsigned HOST_WIDE_INT ival;
7032           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7033           gcc_assert (succeed);
7034
7035           scalar_int_mode imode = (mode == HFmode
7036                                    ? SImode
7037                                    : int_mode_for_mode (mode).require ());
7038           int ncost = aarch64_internal_mov_immediate
7039                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7040           *cost += COSTS_N_INSNS (ncost);
7041           return true;
7042         }
7043
7044       if (speed)
7045         {
7046           /* mov[df,sf]_aarch64.  */
7047           if (aarch64_float_const_representable_p (x))
7048             /* FMOV (scalar immediate).  */
7049             *cost += extra_cost->fp[mode == DFmode].fpconst;
7050           else if (!aarch64_float_const_zero_rtx_p (x))
7051             {
7052               /* This will be a load from memory.  */
7053               if (mode == DFmode)
7054                 *cost += extra_cost->ldst.loadd;
7055               else
7056                 *cost += extra_cost->ldst.loadf;
7057             }
7058           else
7059             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7060                or MOV v0.s[0], wzr - neither of which are modeled by the
7061                cost tables.  Just use the default cost.  */
7062             {
7063             }
7064         }
7065
7066       return true;
7067
7068     case MEM:
7069       if (speed)
7070         {
7071           /* For loads we want the base cost of a load, plus an
7072              approximation for the additional cost of the addressing
7073              mode.  */
7074           rtx address = XEXP (x, 0);
7075           if (VECTOR_MODE_P (mode))
7076             *cost += extra_cost->ldst.loadv;
7077           else if (GET_MODE_CLASS (mode) == MODE_INT)
7078             *cost += extra_cost->ldst.load;
7079           else if (mode == SFmode)
7080             *cost += extra_cost->ldst.loadf;
7081           else if (mode == DFmode)
7082             *cost += extra_cost->ldst.loadd;
7083
7084           *cost +=
7085                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7086                                                      0, speed));
7087         }
7088
7089       return true;
7090
7091     case NEG:
7092       op0 = XEXP (x, 0);
7093
7094       if (VECTOR_MODE_P (mode))
7095         {
7096           if (speed)
7097             {
7098               /* FNEG.  */
7099               *cost += extra_cost->vect.alu;
7100             }
7101           return false;
7102         }
7103
7104       if (GET_MODE_CLASS (mode) == MODE_INT)
7105         {
7106           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7107               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7108             {
7109               /* CSETM.  */
7110               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7111               return true;
7112             }
7113
7114           /* Cost this as SUB wzr, X.  */
7115           op0 = CONST0_RTX (mode);
7116           op1 = XEXP (x, 0);
7117           goto cost_minus;
7118         }
7119
7120       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7121         {
7122           /* Support (neg(fma...)) as a single instruction only if
7123              sign of zeros is unimportant.  This matches the decision
7124              making in aarch64.md.  */
7125           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7126             {
7127               /* FNMADD.  */
7128               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7129               return true;
7130             }
7131           if (GET_CODE (op0) == MULT)
7132             {
7133               /* FNMUL.  */
7134               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7135               return true;
7136             }
7137           if (speed)
7138             /* FNEG.  */
7139             *cost += extra_cost->fp[mode == DFmode].neg;
7140           return false;
7141         }
7142
7143       return false;
7144
7145     case CLRSB:
7146     case CLZ:
7147       if (speed)
7148         {
7149           if (VECTOR_MODE_P (mode))
7150             *cost += extra_cost->vect.alu;
7151           else
7152             *cost += extra_cost->alu.clz;
7153         }
7154
7155       return false;
7156
7157     case COMPARE:
7158       op0 = XEXP (x, 0);
7159       op1 = XEXP (x, 1);
7160
7161       if (op1 == const0_rtx
7162           && GET_CODE (op0) == AND)
7163         {
7164           x = op0;
7165           mode = GET_MODE (op0);
7166           goto cost_logic;
7167         }
7168
7169       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7170         {
7171           /* TODO: A write to the CC flags possibly costs extra, this
7172              needs encoding in the cost tables.  */
7173
7174           mode = GET_MODE (op0);
7175           /* ANDS.  */
7176           if (GET_CODE (op0) == AND)
7177             {
7178               x = op0;
7179               goto cost_logic;
7180             }
7181
7182           if (GET_CODE (op0) == PLUS)
7183             {
7184               /* ADDS (and CMN alias).  */
7185               x = op0;
7186               goto cost_plus;
7187             }
7188
7189           if (GET_CODE (op0) == MINUS)
7190             {
7191               /* SUBS.  */
7192               x = op0;
7193               goto cost_minus;
7194             }
7195
7196           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7197               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7198               && CONST_INT_P (XEXP (op0, 2)))
7199             {
7200               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7201                  Handle it here directly rather than going to cost_logic
7202                  since we know the immediate generated for the TST is valid
7203                  so we can avoid creating an intermediate rtx for it only
7204                  for costing purposes.  */
7205               if (speed)
7206                 *cost += extra_cost->alu.logical;
7207
7208               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7209                                  ZERO_EXTRACT, 0, speed);
7210               return true;
7211             }
7212
7213           if (GET_CODE (op1) == NEG)
7214             {
7215               /* CMN.  */
7216               if (speed)
7217                 *cost += extra_cost->alu.arith;
7218
7219               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7220               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7221               return true;
7222             }
7223
7224           /* CMP.
7225
7226              Compare can freely swap the order of operands, and
7227              canonicalization puts the more complex operation first.
7228              But the integer MINUS logic expects the shift/extend
7229              operation in op1.  */
7230           if (! (REG_P (op0)
7231                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7232           {
7233             op0 = XEXP (x, 1);
7234             op1 = XEXP (x, 0);
7235           }
7236           goto cost_minus;
7237         }
7238
7239       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7240         {
7241           /* FCMP.  */
7242           if (speed)
7243             *cost += extra_cost->fp[mode == DFmode].compare;
7244
7245           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7246             {
7247               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7248               /* FCMP supports constant 0.0 for no extra cost. */
7249               return true;
7250             }
7251           return false;
7252         }
7253
7254       if (VECTOR_MODE_P (mode))
7255         {
7256           /* Vector compare.  */
7257           if (speed)
7258             *cost += extra_cost->vect.alu;
7259
7260           if (aarch64_float_const_zero_rtx_p (op1))
7261             {
7262               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7263                  cost.  */
7264               return true;
7265             }
7266           return false;
7267         }
7268       return false;
7269
7270     case MINUS:
7271       {
7272         op0 = XEXP (x, 0);
7273         op1 = XEXP (x, 1);
7274
7275 cost_minus:
7276         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7277
7278         /* Detect valid immediates.  */
7279         if ((GET_MODE_CLASS (mode) == MODE_INT
7280              || (GET_MODE_CLASS (mode) == MODE_CC
7281                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7282             && CONST_INT_P (op1)
7283             && aarch64_uimm12_shift (INTVAL (op1)))
7284           {
7285             if (speed)
7286               /* SUB(S) (immediate).  */
7287               *cost += extra_cost->alu.arith;
7288             return true;
7289           }
7290
7291         /* Look for SUB (extended register).  */
7292         if (is_a <scalar_int_mode> (mode, &int_mode)
7293             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7294           {
7295             if (speed)
7296               *cost += extra_cost->alu.extend_arith;
7297
7298             op1 = aarch64_strip_extend (op1, true);
7299             *cost += rtx_cost (op1, VOIDmode,
7300                                (enum rtx_code) GET_CODE (op1), 0, speed);
7301             return true;
7302           }
7303
7304         rtx new_op1 = aarch64_strip_extend (op1, false);
7305
7306         /* Cost this as an FMA-alike operation.  */
7307         if ((GET_CODE (new_op1) == MULT
7308              || aarch64_shift_p (GET_CODE (new_op1)))
7309             && code != COMPARE)
7310           {
7311             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7312                                             (enum rtx_code) code,
7313                                             speed);
7314             return true;
7315           }
7316
7317         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7318
7319         if (speed)
7320           {
7321             if (VECTOR_MODE_P (mode))
7322               {
7323                 /* Vector SUB.  */
7324                 *cost += extra_cost->vect.alu;
7325               }
7326             else if (GET_MODE_CLASS (mode) == MODE_INT)
7327               {
7328                 /* SUB(S).  */
7329                 *cost += extra_cost->alu.arith;
7330               }
7331             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7332               {
7333                 /* FSUB.  */
7334                 *cost += extra_cost->fp[mode == DFmode].addsub;
7335               }
7336           }
7337         return true;
7338       }
7339
7340     case PLUS:
7341       {
7342         rtx new_op0;
7343
7344         op0 = XEXP (x, 0);
7345         op1 = XEXP (x, 1);
7346
7347 cost_plus:
7348         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7349             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7350           {
7351             /* CSINC.  */
7352             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7353             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7354             return true;
7355           }
7356
7357         if (GET_MODE_CLASS (mode) == MODE_INT
7358             && CONST_INT_P (op1)
7359             && aarch64_uimm12_shift (INTVAL (op1)))
7360           {
7361             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7362
7363             if (speed)
7364               /* ADD (immediate).  */
7365               *cost += extra_cost->alu.arith;
7366             return true;
7367           }
7368
7369         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7370
7371         /* Look for ADD (extended register).  */
7372         if (is_a <scalar_int_mode> (mode, &int_mode)
7373             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7374           {
7375             if (speed)
7376               *cost += extra_cost->alu.extend_arith;
7377
7378             op0 = aarch64_strip_extend (op0, true);
7379             *cost += rtx_cost (op0, VOIDmode,
7380                                (enum rtx_code) GET_CODE (op0), 0, speed);
7381             return true;
7382           }
7383
7384         /* Strip any extend, leave shifts behind as we will
7385            cost them through mult_cost.  */
7386         new_op0 = aarch64_strip_extend (op0, false);
7387
7388         if (GET_CODE (new_op0) == MULT
7389             || aarch64_shift_p (GET_CODE (new_op0)))
7390           {
7391             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7392                                             speed);
7393             return true;
7394           }
7395
7396         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7397
7398         if (speed)
7399           {
7400             if (VECTOR_MODE_P (mode))
7401               {
7402                 /* Vector ADD.  */
7403                 *cost += extra_cost->vect.alu;
7404               }
7405             else if (GET_MODE_CLASS (mode) == MODE_INT)
7406               {
7407                 /* ADD.  */
7408                 *cost += extra_cost->alu.arith;
7409               }
7410             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7411               {
7412                 /* FADD.  */
7413                 *cost += extra_cost->fp[mode == DFmode].addsub;
7414               }
7415           }
7416         return true;
7417       }
7418
7419     case BSWAP:
7420       *cost = COSTS_N_INSNS (1);
7421
7422       if (speed)
7423         {
7424           if (VECTOR_MODE_P (mode))
7425             *cost += extra_cost->vect.alu;
7426           else
7427             *cost += extra_cost->alu.rev;
7428         }
7429       return false;
7430
7431     case IOR:
7432       if (aarch_rev16_p (x))
7433         {
7434           *cost = COSTS_N_INSNS (1);
7435
7436           if (speed)
7437             {
7438               if (VECTOR_MODE_P (mode))
7439                 *cost += extra_cost->vect.alu;
7440               else
7441                 *cost += extra_cost->alu.rev;
7442             }
7443           return true;
7444         }
7445
7446       if (aarch64_extr_rtx_p (x, &op0, &op1))
7447         {
7448           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7449           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7450           if (speed)
7451             *cost += extra_cost->alu.shift;
7452
7453           return true;
7454         }
7455     /* Fall through.  */
7456     case XOR:
7457     case AND:
7458     cost_logic:
7459       op0 = XEXP (x, 0);
7460       op1 = XEXP (x, 1);
7461
7462       if (VECTOR_MODE_P (mode))
7463         {
7464           if (speed)
7465             *cost += extra_cost->vect.alu;
7466           return true;
7467         }
7468
7469       if (code == AND
7470           && GET_CODE (op0) == MULT
7471           && CONST_INT_P (XEXP (op0, 1))
7472           && CONST_INT_P (op1)
7473           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7474                                INTVAL (op1)) != 0)
7475         {
7476           /* This is a UBFM/SBFM.  */
7477           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7478           if (speed)
7479             *cost += extra_cost->alu.bfx;
7480           return true;
7481         }
7482
7483       if (is_int_mode (mode, &int_mode))
7484         {
7485           if (CONST_INT_P (op1))
7486             {
7487               /* We have a mask + shift version of a UBFIZ
7488                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7489               if (GET_CODE (op0) == ASHIFT
7490                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7491                                                          XEXP (op0, 1)))
7492                 {
7493                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
7494                                      (enum rtx_code) code, 0, speed);
7495                   if (speed)
7496                     *cost += extra_cost->alu.bfx;
7497
7498                   return true;
7499                 }
7500               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7501                 {
7502                 /* We possibly get the immediate for free, this is not
7503                    modelled.  */
7504                   *cost += rtx_cost (op0, int_mode,
7505                                      (enum rtx_code) code, 0, speed);
7506                   if (speed)
7507                     *cost += extra_cost->alu.logical;
7508
7509                   return true;
7510                 }
7511             }
7512           else
7513             {
7514               rtx new_op0 = op0;
7515
7516               /* Handle ORN, EON, or BIC.  */
7517               if (GET_CODE (op0) == NOT)
7518                 op0 = XEXP (op0, 0);
7519
7520               new_op0 = aarch64_strip_shift (op0);
7521
7522               /* If we had a shift on op0 then this is a logical-shift-
7523                  by-register/immediate operation.  Otherwise, this is just
7524                  a logical operation.  */
7525               if (speed)
7526                 {
7527                   if (new_op0 != op0)
7528                     {
7529                       /* Shift by immediate.  */
7530                       if (CONST_INT_P (XEXP (op0, 1)))
7531                         *cost += extra_cost->alu.log_shift;
7532                       else
7533                         *cost += extra_cost->alu.log_shift_reg;
7534                     }
7535                   else
7536                     *cost += extra_cost->alu.logical;
7537                 }
7538
7539               /* In both cases we want to cost both operands.  */
7540               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7541                                  0, speed);
7542               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7543                                  1, speed);
7544
7545               return true;
7546             }
7547         }
7548       return false;
7549
7550     case NOT:
7551       x = XEXP (x, 0);
7552       op0 = aarch64_strip_shift (x);
7553
7554       if (VECTOR_MODE_P (mode))
7555         {
7556           /* Vector NOT.  */
7557           *cost += extra_cost->vect.alu;
7558           return false;
7559         }
7560
7561       /* MVN-shifted-reg.  */
7562       if (op0 != x)
7563         {
7564           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7565
7566           if (speed)
7567             *cost += extra_cost->alu.log_shift;
7568
7569           return true;
7570         }
7571       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7572          Handle the second form here taking care that 'a' in the above can
7573          be a shift.  */
7574       else if (GET_CODE (op0) == XOR)
7575         {
7576           rtx newop0 = XEXP (op0, 0);
7577           rtx newop1 = XEXP (op0, 1);
7578           rtx op0_stripped = aarch64_strip_shift (newop0);
7579
7580           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7581           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7582
7583           if (speed)
7584             {
7585               if (op0_stripped != newop0)
7586                 *cost += extra_cost->alu.log_shift;
7587               else
7588                 *cost += extra_cost->alu.logical;
7589             }
7590
7591           return true;
7592         }
7593       /* MVN.  */
7594       if (speed)
7595         *cost += extra_cost->alu.logical;
7596
7597       return false;
7598
7599     case ZERO_EXTEND:
7600
7601       op0 = XEXP (x, 0);
7602       /* If a value is written in SI mode, then zero extended to DI
7603          mode, the operation will in general be free as a write to
7604          a 'w' register implicitly zeroes the upper bits of an 'x'
7605          register.  However, if this is
7606
7607            (set (reg) (zero_extend (reg)))
7608
7609          we must cost the explicit register move.  */
7610       if (mode == DImode
7611           && GET_MODE (op0) == SImode
7612           && outer == SET)
7613         {
7614           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7615
7616         /* If OP_COST is non-zero, then the cost of the zero extend
7617            is effectively the cost of the inner operation.  Otherwise
7618            we have a MOV instruction and we take the cost from the MOV
7619            itself.  This is true independently of whether we are
7620            optimizing for space or time.  */
7621           if (op_cost)
7622             *cost = op_cost;
7623
7624           return true;
7625         }
7626       else if (MEM_P (op0))
7627         {
7628           /* All loads can zero extend to any size for free.  */
7629           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7630           return true;
7631         }
7632
7633       op0 = aarch64_extend_bitfield_pattern_p (x);
7634       if (op0)
7635         {
7636           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7637           if (speed)
7638             *cost += extra_cost->alu.bfx;
7639           return true;
7640         }
7641
7642       if (speed)
7643         {
7644           if (VECTOR_MODE_P (mode))
7645             {
7646               /* UMOV.  */
7647               *cost += extra_cost->vect.alu;
7648             }
7649           else
7650             {
7651               /* We generate an AND instead of UXTB/UXTH.  */
7652               *cost += extra_cost->alu.logical;
7653             }
7654         }
7655       return false;
7656
7657     case SIGN_EXTEND:
7658       if (MEM_P (XEXP (x, 0)))
7659         {
7660           /* LDRSH.  */
7661           if (speed)
7662             {
7663               rtx address = XEXP (XEXP (x, 0), 0);
7664               *cost += extra_cost->ldst.load_sign_extend;
7665
7666               *cost +=
7667                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7668                                                      0, speed));
7669             }
7670           return true;
7671         }
7672
7673       op0 = aarch64_extend_bitfield_pattern_p (x);
7674       if (op0)
7675         {
7676           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7677           if (speed)
7678             *cost += extra_cost->alu.bfx;
7679           return true;
7680         }
7681
7682       if (speed)
7683         {
7684           if (VECTOR_MODE_P (mode))
7685             *cost += extra_cost->vect.alu;
7686           else
7687             *cost += extra_cost->alu.extend;
7688         }
7689       return false;
7690
7691     case ASHIFT:
7692       op0 = XEXP (x, 0);
7693       op1 = XEXP (x, 1);
7694
7695       if (CONST_INT_P (op1))
7696         {
7697           if (speed)
7698             {
7699               if (VECTOR_MODE_P (mode))
7700                 {
7701                   /* Vector shift (immediate).  */
7702                   *cost += extra_cost->vect.alu;
7703                 }
7704               else
7705                 {
7706                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7707                      aliases.  */
7708                   *cost += extra_cost->alu.shift;
7709                 }
7710             }
7711
7712           /* We can incorporate zero/sign extend for free.  */
7713           if (GET_CODE (op0) == ZERO_EXTEND
7714               || GET_CODE (op0) == SIGN_EXTEND)
7715             op0 = XEXP (op0, 0);
7716
7717           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7718           return true;
7719         }
7720       else
7721         {
7722           if (VECTOR_MODE_P (mode))
7723             {
7724               if (speed)
7725                 /* Vector shift (register).  */
7726                 *cost += extra_cost->vect.alu;
7727             }
7728           else
7729             {
7730               if (speed)
7731                 /* LSLV.  */
7732                 *cost += extra_cost->alu.shift_reg;
7733
7734               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7735                   && CONST_INT_P (XEXP (op1, 1))
7736                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7737                 {
7738                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7739                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7740                      don't recurse into it.  */
7741                   return true;
7742                 }
7743             }
7744           return false;  /* All arguments need to be in registers.  */
7745         }
7746
7747     case ROTATE:
7748     case ROTATERT:
7749     case LSHIFTRT:
7750     case ASHIFTRT:
7751       op0 = XEXP (x, 0);
7752       op1 = XEXP (x, 1);
7753
7754       if (CONST_INT_P (op1))
7755         {
7756           /* ASR (immediate) and friends.  */
7757           if (speed)
7758             {
7759               if (VECTOR_MODE_P (mode))
7760                 *cost += extra_cost->vect.alu;
7761               else
7762                 *cost += extra_cost->alu.shift;
7763             }
7764
7765           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7766           return true;
7767         }
7768       else
7769         {
7770           if (VECTOR_MODE_P (mode))
7771             {
7772               if (speed)
7773                 /* Vector shift (register).  */
7774                 *cost += extra_cost->vect.alu;
7775             }
7776           else
7777             {
7778               if (speed)
7779                 /* ASR (register) and friends.  */
7780                 *cost += extra_cost->alu.shift_reg;
7781
7782               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7783                   && CONST_INT_P (XEXP (op1, 1))
7784                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7785                 {
7786                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7787                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7788                      don't recurse into it.  */
7789                   return true;
7790                 }
7791             }
7792           return false;  /* All arguments need to be in registers.  */
7793         }
7794
7795     case SYMBOL_REF:
7796
7797       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7798           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7799         {
7800           /* LDR.  */
7801           if (speed)
7802             *cost += extra_cost->ldst.load;
7803         }
7804       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7805                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7806         {
7807           /* ADRP, followed by ADD.  */
7808           *cost += COSTS_N_INSNS (1);
7809           if (speed)
7810             *cost += 2 * extra_cost->alu.arith;
7811         }
7812       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7813                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7814         {
7815           /* ADR.  */
7816           if (speed)
7817             *cost += extra_cost->alu.arith;
7818         }
7819
7820       if (flag_pic)
7821         {
7822           /* One extra load instruction, after accessing the GOT.  */
7823           *cost += COSTS_N_INSNS (1);
7824           if (speed)
7825             *cost += extra_cost->ldst.load;
7826         }
7827       return true;
7828
7829     case HIGH:
7830     case LO_SUM:
7831       /* ADRP/ADD (immediate).  */
7832       if (speed)
7833         *cost += extra_cost->alu.arith;
7834       return true;
7835
7836     case ZERO_EXTRACT:
7837     case SIGN_EXTRACT:
7838       /* UBFX/SBFX.  */
7839       if (speed)
7840         {
7841           if (VECTOR_MODE_P (mode))
7842             *cost += extra_cost->vect.alu;
7843           else
7844             *cost += extra_cost->alu.bfx;
7845         }
7846
7847       /* We can trust that the immediates used will be correct (there
7848          are no by-register forms), so we need only cost op0.  */
7849       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7850       return true;
7851
7852     case MULT:
7853       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7854       /* aarch64_rtx_mult_cost always handles recursion to its
7855          operands.  */
7856       return true;
7857
7858     case MOD:
7859     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7860        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7861        an unconditional negate.  This case should only ever be reached through
7862        the set_smod_pow2_cheap check in expmed.c.  */
7863       if (CONST_INT_P (XEXP (x, 1))
7864           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7865           && (mode == SImode || mode == DImode))
7866         {
7867           /* We expand to 4 instructions.  Reset the baseline.  */
7868           *cost = COSTS_N_INSNS (4);
7869
7870           if (speed)
7871             *cost += 2 * extra_cost->alu.logical
7872                      + 2 * extra_cost->alu.arith;
7873
7874           return true;
7875         }
7876
7877     /* Fall-through.  */
7878     case UMOD:
7879       if (speed)
7880         {
7881           /* Slighly prefer UMOD over SMOD.  */
7882           if (VECTOR_MODE_P (mode))
7883             *cost += extra_cost->vect.alu;
7884           else if (GET_MODE_CLASS (mode) == MODE_INT)
7885             *cost += (extra_cost->mult[mode == DImode].add
7886                       + extra_cost->mult[mode == DImode].idiv
7887                       + (code == MOD ? 1 : 0));
7888         }
7889       return false;  /* All arguments need to be in registers.  */
7890
7891     case DIV:
7892     case UDIV:
7893     case SQRT:
7894       if (speed)
7895         {
7896           if (VECTOR_MODE_P (mode))
7897             *cost += extra_cost->vect.alu;
7898           else if (GET_MODE_CLASS (mode) == MODE_INT)
7899             /* There is no integer SQRT, so only DIV and UDIV can get
7900                here.  */
7901             *cost += (extra_cost->mult[mode == DImode].idiv
7902                      /* Slighly prefer UDIV over SDIV.  */
7903                      + (code == DIV ? 1 : 0));
7904           else
7905             *cost += extra_cost->fp[mode == DFmode].div;
7906         }
7907       return false;  /* All arguments need to be in registers.  */
7908
7909     case IF_THEN_ELSE:
7910       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7911                                          XEXP (x, 2), cost, speed);
7912
7913     case EQ:
7914     case NE:
7915     case GT:
7916     case GTU:
7917     case LT:
7918     case LTU:
7919     case GE:
7920     case GEU:
7921     case LE:
7922     case LEU:
7923
7924       return false; /* All arguments must be in registers.  */
7925
7926     case FMA:
7927       op0 = XEXP (x, 0);
7928       op1 = XEXP (x, 1);
7929       op2 = XEXP (x, 2);
7930
7931       if (speed)
7932         {
7933           if (VECTOR_MODE_P (mode))
7934             *cost += extra_cost->vect.alu;
7935           else
7936             *cost += extra_cost->fp[mode == DFmode].fma;
7937         }
7938
7939       /* FMSUB, FNMADD, and FNMSUB are free.  */
7940       if (GET_CODE (op0) == NEG)
7941         op0 = XEXP (op0, 0);
7942
7943       if (GET_CODE (op2) == NEG)
7944         op2 = XEXP (op2, 0);
7945
7946       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7947          and the by-element operand as operand 0.  */
7948       if (GET_CODE (op1) == NEG)
7949         op1 = XEXP (op1, 0);
7950
7951       /* Catch vector-by-element operations.  The by-element operand can
7952          either be (vec_duplicate (vec_select (x))) or just
7953          (vec_select (x)), depending on whether we are multiplying by
7954          a vector or a scalar.
7955
7956          Canonicalization is not very good in these cases, FMA4 will put the
7957          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7958       if (GET_CODE (op0) == VEC_DUPLICATE)
7959         op0 = XEXP (op0, 0);
7960       else if (GET_CODE (op1) == VEC_DUPLICATE)
7961         op1 = XEXP (op1, 0);
7962
7963       if (GET_CODE (op0) == VEC_SELECT)
7964         op0 = XEXP (op0, 0);
7965       else if (GET_CODE (op1) == VEC_SELECT)
7966         op1 = XEXP (op1, 0);
7967
7968       /* If the remaining parameters are not registers,
7969          get the cost to put them into registers.  */
7970       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7971       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7972       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7973       return true;
7974
7975     case FLOAT:
7976     case UNSIGNED_FLOAT:
7977       if (speed)
7978         *cost += extra_cost->fp[mode == DFmode].fromint;
7979       return false;
7980
7981     case FLOAT_EXTEND:
7982       if (speed)
7983         {
7984           if (VECTOR_MODE_P (mode))
7985             {
7986               /*Vector truncate.  */
7987               *cost += extra_cost->vect.alu;
7988             }
7989           else
7990             *cost += extra_cost->fp[mode == DFmode].widen;
7991         }
7992       return false;
7993
7994     case FLOAT_TRUNCATE:
7995       if (speed)
7996         {
7997           if (VECTOR_MODE_P (mode))
7998             {
7999               /*Vector conversion.  */
8000               *cost += extra_cost->vect.alu;
8001             }
8002           else
8003             *cost += extra_cost->fp[mode == DFmode].narrow;
8004         }
8005       return false;
8006
8007     case FIX:
8008     case UNSIGNED_FIX:
8009       x = XEXP (x, 0);
8010       /* Strip the rounding part.  They will all be implemented
8011          by the fcvt* family of instructions anyway.  */
8012       if (GET_CODE (x) == UNSPEC)
8013         {
8014           unsigned int uns_code = XINT (x, 1);
8015
8016           if (uns_code == UNSPEC_FRINTA
8017               || uns_code == UNSPEC_FRINTM
8018               || uns_code == UNSPEC_FRINTN
8019               || uns_code == UNSPEC_FRINTP
8020               || uns_code == UNSPEC_FRINTZ)
8021             x = XVECEXP (x, 0, 0);
8022         }
8023
8024       if (speed)
8025         {
8026           if (VECTOR_MODE_P (mode))
8027             *cost += extra_cost->vect.alu;
8028           else
8029             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8030         }
8031
8032       /* We can combine fmul by a power of 2 followed by a fcvt into a single
8033          fixed-point fcvt.  */
8034       if (GET_CODE (x) == MULT
8035           && ((VECTOR_MODE_P (mode)
8036                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8037               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8038         {
8039           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8040                              0, speed);
8041           return true;
8042         }
8043
8044       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8045       return true;
8046
8047     case ABS:
8048       if (VECTOR_MODE_P (mode))
8049         {
8050           /* ABS (vector).  */
8051           if (speed)
8052             *cost += extra_cost->vect.alu;
8053         }
8054       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8055         {
8056           op0 = XEXP (x, 0);
8057
8058           /* FABD, which is analogous to FADD.  */
8059           if (GET_CODE (op0) == MINUS)
8060             {
8061               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8062               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8063               if (speed)
8064                 *cost += extra_cost->fp[mode == DFmode].addsub;
8065
8066               return true;
8067             }
8068           /* Simple FABS is analogous to FNEG.  */
8069           if (speed)
8070             *cost += extra_cost->fp[mode == DFmode].neg;
8071         }
8072       else
8073         {
8074           /* Integer ABS will either be split to
8075              two arithmetic instructions, or will be an ABS
8076              (scalar), which we don't model.  */
8077           *cost = COSTS_N_INSNS (2);
8078           if (speed)
8079             *cost += 2 * extra_cost->alu.arith;
8080         }
8081       return false;
8082
8083     case SMAX:
8084     case SMIN:
8085       if (speed)
8086         {
8087           if (VECTOR_MODE_P (mode))
8088             *cost += extra_cost->vect.alu;
8089           else
8090             {
8091               /* FMAXNM/FMINNM/FMAX/FMIN.
8092                  TODO: This may not be accurate for all implementations, but
8093                  we do not model this in the cost tables.  */
8094               *cost += extra_cost->fp[mode == DFmode].addsub;
8095             }
8096         }
8097       return false;
8098
8099     case UNSPEC:
8100       /* The floating point round to integer frint* instructions.  */
8101       if (aarch64_frint_unspec_p (XINT (x, 1)))
8102         {
8103           if (speed)
8104             *cost += extra_cost->fp[mode == DFmode].roundint;
8105
8106           return false;
8107         }
8108
8109       if (XINT (x, 1) == UNSPEC_RBIT)
8110         {
8111           if (speed)
8112             *cost += extra_cost->alu.rev;
8113
8114           return false;
8115         }
8116       break;
8117
8118     case TRUNCATE:
8119
8120       /* Decompose <su>muldi3_highpart.  */
8121       if (/* (truncate:DI  */
8122           mode == DImode
8123           /*   (lshiftrt:TI  */
8124           && GET_MODE (XEXP (x, 0)) == TImode
8125           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8126           /*      (mult:TI  */
8127           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8128           /*        (ANY_EXTEND:TI (reg:DI))
8129                     (ANY_EXTEND:TI (reg:DI)))  */
8130           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8131                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8132               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8133                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8134           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8135           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8136           /*     (const_int 64)  */
8137           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8138           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8139         {
8140           /* UMULH/SMULH.  */
8141           if (speed)
8142             *cost += extra_cost->mult[mode == DImode].extend;
8143           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8144                              mode, MULT, 0, speed);
8145           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8146                              mode, MULT, 1, speed);
8147           return true;
8148         }
8149
8150       /* Fall through.  */
8151     default:
8152       break;
8153     }
8154
8155   if (dump_file
8156       && flag_aarch64_verbose_cost)
8157     fprintf (dump_file,
8158       "\nFailed to cost RTX.  Assuming default cost.\n");
8159
8160   return true;
8161 }
8162
8163 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8164    calculated for X.  This cost is stored in *COST.  Returns true
8165    if the total cost of X was calculated.  */
8166 static bool
8167 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8168                    int param, int *cost, bool speed)
8169 {
8170   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8171
8172   if (dump_file
8173       && flag_aarch64_verbose_cost)
8174     {
8175       print_rtl_single (dump_file, x);
8176       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8177                speed ? "Hot" : "Cold",
8178                *cost, result ? "final" : "partial");
8179     }
8180
8181   return result;
8182 }
8183
8184 static int
8185 aarch64_register_move_cost (machine_mode mode,
8186                             reg_class_t from_i, reg_class_t to_i)
8187 {
8188   enum reg_class from = (enum reg_class) from_i;
8189   enum reg_class to = (enum reg_class) to_i;
8190   const struct cpu_regmove_cost *regmove_cost
8191     = aarch64_tune_params.regmove_cost;
8192
8193   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8194   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8195     to = GENERAL_REGS;
8196
8197   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8198     from = GENERAL_REGS;
8199
8200   /* Moving between GPR and stack cost is the same as GP2GP.  */
8201   if ((from == GENERAL_REGS && to == STACK_REG)
8202       || (to == GENERAL_REGS && from == STACK_REG))
8203     return regmove_cost->GP2GP;
8204
8205   /* To/From the stack register, we move via the gprs.  */
8206   if (to == STACK_REG || from == STACK_REG)
8207     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8208             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8209
8210   if (GET_MODE_SIZE (mode) == 16)
8211     {
8212       /* 128-bit operations on general registers require 2 instructions.  */
8213       if (from == GENERAL_REGS && to == GENERAL_REGS)
8214         return regmove_cost->GP2GP * 2;
8215       else if (from == GENERAL_REGS)
8216         return regmove_cost->GP2FP * 2;
8217       else if (to == GENERAL_REGS)
8218         return regmove_cost->FP2GP * 2;
8219
8220       /* When AdvSIMD instructions are disabled it is not possible to move
8221          a 128-bit value directly between Q registers.  This is handled in
8222          secondary reload.  A general register is used as a scratch to move
8223          the upper DI value and the lower DI value is moved directly,
8224          hence the cost is the sum of three moves. */
8225       if (! TARGET_SIMD)
8226         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8227
8228       return regmove_cost->FP2FP;
8229     }
8230
8231   if (from == GENERAL_REGS && to == GENERAL_REGS)
8232     return regmove_cost->GP2GP;
8233   else if (from == GENERAL_REGS)
8234     return regmove_cost->GP2FP;
8235   else if (to == GENERAL_REGS)
8236     return regmove_cost->FP2GP;
8237
8238   return regmove_cost->FP2FP;
8239 }
8240
8241 static int
8242 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8243                           reg_class_t rclass ATTRIBUTE_UNUSED,
8244                           bool in ATTRIBUTE_UNUSED)
8245 {
8246   return aarch64_tune_params.memmov_cost;
8247 }
8248
8249 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8250    to optimize 1.0/sqrt.  */
8251
8252 static bool
8253 use_rsqrt_p (machine_mode mode)
8254 {
8255   return (!flag_trapping_math
8256           && flag_unsafe_math_optimizations
8257           && ((aarch64_tune_params.approx_modes->recip_sqrt
8258                & AARCH64_APPROX_MODE (mode))
8259               || flag_mrecip_low_precision_sqrt));
8260 }
8261
8262 /* Function to decide when to use the approximate reciprocal square root
8263    builtin.  */
8264
8265 static tree
8266 aarch64_builtin_reciprocal (tree fndecl)
8267 {
8268   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8269
8270   if (!use_rsqrt_p (mode))
8271     return NULL_TREE;
8272   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8273 }
8274
8275 typedef rtx (*rsqrte_type) (rtx, rtx);
8276
8277 /* Select reciprocal square root initial estimate insn depending on machine
8278    mode.  */
8279
8280 static rsqrte_type
8281 get_rsqrte_type (machine_mode mode)
8282 {
8283   switch (mode)
8284   {
8285     case E_DFmode:   return gen_aarch64_rsqrtedf;
8286     case E_SFmode:   return gen_aarch64_rsqrtesf;
8287     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8288     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8289     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8290     default: gcc_unreachable ();
8291   }
8292 }
8293
8294 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8295
8296 /* Select reciprocal square root series step insn depending on machine mode.  */
8297
8298 static rsqrts_type
8299 get_rsqrts_type (machine_mode mode)
8300 {
8301   switch (mode)
8302   {
8303     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8304     case E_SFmode:   return gen_aarch64_rsqrtssf;
8305     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8306     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8307     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8308     default: gcc_unreachable ();
8309   }
8310 }
8311
8312 /* Emit instruction sequence to compute either the approximate square root
8313    or its approximate reciprocal, depending on the flag RECP, and return
8314    whether the sequence was emitted or not.  */
8315
8316 bool
8317 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8318 {
8319   machine_mode mode = GET_MODE (dst);
8320
8321   if (GET_MODE_INNER (mode) == HFmode)
8322     {
8323       gcc_assert (!recp);
8324       return false;
8325     }
8326
8327   if (!recp)
8328     {
8329       if (!(flag_mlow_precision_sqrt
8330             || (aarch64_tune_params.approx_modes->sqrt
8331                 & AARCH64_APPROX_MODE (mode))))
8332         return false;
8333
8334       if (flag_finite_math_only
8335           || flag_trapping_math
8336           || !flag_unsafe_math_optimizations
8337           || optimize_function_for_size_p (cfun))
8338         return false;
8339     }
8340   else
8341     /* Caller assumes we cannot fail.  */
8342     gcc_assert (use_rsqrt_p (mode));
8343
8344   machine_mode mmsk = mode_for_int_vector (mode).require ();
8345   rtx xmsk = gen_reg_rtx (mmsk);
8346   if (!recp)
8347     /* When calculating the approximate square root, compare the
8348        argument with 0.0 and create a mask.  */
8349     emit_insn (gen_rtx_SET (xmsk,
8350                             gen_rtx_NEG (mmsk,
8351                                          gen_rtx_EQ (mmsk, src,
8352                                                      CONST0_RTX (mode)))));
8353
8354   /* Estimate the approximate reciprocal square root.  */
8355   rtx xdst = gen_reg_rtx (mode);
8356   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8357
8358   /* Iterate over the series twice for SF and thrice for DF.  */
8359   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8360
8361   /* Optionally iterate over the series once less for faster performance
8362      while sacrificing the accuracy.  */
8363   if ((recp && flag_mrecip_low_precision_sqrt)
8364       || (!recp && flag_mlow_precision_sqrt))
8365     iterations--;
8366
8367   /* Iterate over the series to calculate the approximate reciprocal square
8368      root.  */
8369   rtx x1 = gen_reg_rtx (mode);
8370   while (iterations--)
8371     {
8372       rtx x2 = gen_reg_rtx (mode);
8373       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8374
8375       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8376
8377       if (iterations > 0)
8378         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8379     }
8380
8381   if (!recp)
8382     {
8383       /* Qualify the approximate reciprocal square root when the argument is
8384          0.0 by squashing the intermediary result to 0.0.  */
8385       rtx xtmp = gen_reg_rtx (mmsk);
8386       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8387                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8388       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8389
8390       /* Calculate the approximate square root.  */
8391       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8392     }
8393
8394   /* Finalize the approximation.  */
8395   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8396
8397   return true;
8398 }
8399
8400 typedef rtx (*recpe_type) (rtx, rtx);
8401
8402 /* Select reciprocal initial estimate insn depending on machine mode.  */
8403
8404 static recpe_type
8405 get_recpe_type (machine_mode mode)
8406 {
8407   switch (mode)
8408   {
8409     case E_SFmode:   return (gen_aarch64_frecpesf);
8410     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8411     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8412     case E_DFmode:   return (gen_aarch64_frecpedf);
8413     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8414     default:         gcc_unreachable ();
8415   }
8416 }
8417
8418 typedef rtx (*recps_type) (rtx, rtx, rtx);
8419
8420 /* Select reciprocal series step insn depending on machine mode.  */
8421
8422 static recps_type
8423 get_recps_type (machine_mode mode)
8424 {
8425   switch (mode)
8426   {
8427     case E_SFmode:   return (gen_aarch64_frecpssf);
8428     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8429     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8430     case E_DFmode:   return (gen_aarch64_frecpsdf);
8431     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8432     default:         gcc_unreachable ();
8433   }
8434 }
8435
8436 /* Emit the instruction sequence to compute the approximation for the division
8437    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8438
8439 bool
8440 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8441 {
8442   machine_mode mode = GET_MODE (quo);
8443
8444   if (GET_MODE_INNER (mode) == HFmode)
8445     return false;
8446
8447   bool use_approx_division_p = (flag_mlow_precision_div
8448                                 || (aarch64_tune_params.approx_modes->division
8449                                     & AARCH64_APPROX_MODE (mode)));
8450
8451   if (!flag_finite_math_only
8452       || flag_trapping_math
8453       || !flag_unsafe_math_optimizations
8454       || optimize_function_for_size_p (cfun)
8455       || !use_approx_division_p)
8456     return false;
8457
8458   /* Estimate the approximate reciprocal.  */
8459   rtx xrcp = gen_reg_rtx (mode);
8460   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8461
8462   /* Iterate over the series twice for SF and thrice for DF.  */
8463   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8464
8465   /* Optionally iterate over the series once less for faster performance,
8466      while sacrificing the accuracy.  */
8467   if (flag_mlow_precision_div)
8468     iterations--;
8469
8470   /* Iterate over the series to calculate the approximate reciprocal.  */
8471   rtx xtmp = gen_reg_rtx (mode);
8472   while (iterations--)
8473     {
8474       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8475
8476       if (iterations > 0)
8477         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8478     }
8479
8480   if (num != CONST1_RTX (mode))
8481     {
8482       /* As the approximate reciprocal of DEN is already calculated, only
8483          calculate the approximate division when NUM is not 1.0.  */
8484       rtx xnum = force_reg (mode, num);
8485       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8486     }
8487
8488   /* Finalize the approximation.  */
8489   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8490   return true;
8491 }
8492
8493 /* Return the number of instructions that can be issued per cycle.  */
8494 static int
8495 aarch64_sched_issue_rate (void)
8496 {
8497   return aarch64_tune_params.issue_rate;
8498 }
8499
8500 static int
8501 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8502 {
8503   int issue_rate = aarch64_sched_issue_rate ();
8504
8505   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8506 }
8507
8508
8509 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8510    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8511    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8512
8513 static int
8514 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8515                                                     int ready_index)
8516 {
8517   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8518 }
8519
8520
8521 /* Vectorizer cost model target hooks.  */
8522
8523 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8524 static int
8525 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8526                                     tree vectype,
8527                                     int misalign ATTRIBUTE_UNUSED)
8528 {
8529   unsigned elements;
8530   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8531   bool fp = false;
8532
8533   if (vectype != NULL)
8534     fp = FLOAT_TYPE_P (vectype);
8535
8536   switch (type_of_cost)
8537     {
8538       case scalar_stmt:
8539         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8540
8541       case scalar_load:
8542         return costs->scalar_load_cost;
8543
8544       case scalar_store:
8545         return costs->scalar_store_cost;
8546
8547       case vector_stmt:
8548         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8549
8550       case vector_load:
8551         return costs->vec_align_load_cost;
8552
8553       case vector_store:
8554         return costs->vec_store_cost;
8555
8556       case vec_to_scalar:
8557         return costs->vec_to_scalar_cost;
8558
8559       case scalar_to_vec:
8560         return costs->scalar_to_vec_cost;
8561
8562       case unaligned_load:
8563       case vector_gather_load:
8564         return costs->vec_unalign_load_cost;
8565
8566       case unaligned_store:
8567       case vector_scatter_store:
8568         return costs->vec_unalign_store_cost;
8569
8570       case cond_branch_taken:
8571         return costs->cond_taken_branch_cost;
8572
8573       case cond_branch_not_taken:
8574         return costs->cond_not_taken_branch_cost;
8575
8576       case vec_perm:
8577         return costs->vec_permute_cost;
8578
8579       case vec_promote_demote:
8580         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8581
8582       case vec_construct:
8583         elements = TYPE_VECTOR_SUBPARTS (vectype);
8584         return elements / 2 + 1;
8585
8586       default:
8587         gcc_unreachable ();
8588     }
8589 }
8590
8591 /* Implement targetm.vectorize.add_stmt_cost.  */
8592 static unsigned
8593 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8594                        struct _stmt_vec_info *stmt_info, int misalign,
8595                        enum vect_cost_model_location where)
8596 {
8597   unsigned *cost = (unsigned *) data;
8598   unsigned retval = 0;
8599
8600   if (flag_vect_cost_model)
8601     {
8602       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8603       int stmt_cost =
8604             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8605
8606       /* Statements in an inner loop relative to the loop being
8607          vectorized are weighted more heavily.  The value here is
8608          arbitrary and could potentially be improved with analysis.  */
8609       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8610         count *= 50; /*  FIXME  */
8611
8612       retval = (unsigned) (count * stmt_cost);
8613       cost[where] += retval;
8614     }
8615
8616   return retval;
8617 }
8618
8619 static void initialize_aarch64_code_model (struct gcc_options *);
8620
8621 /* Parse the TO_PARSE string and put the architecture struct that it
8622    selects into RES and the architectural features into ISA_FLAGS.
8623    Return an aarch64_parse_opt_result describing the parse result.
8624    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8625
8626 static enum aarch64_parse_opt_result
8627 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8628                     unsigned long *isa_flags)
8629 {
8630   char *ext;
8631   const struct processor *arch;
8632   char *str = (char *) alloca (strlen (to_parse) + 1);
8633   size_t len;
8634
8635   strcpy (str, to_parse);
8636
8637   ext = strchr (str, '+');
8638
8639   if (ext != NULL)
8640     len = ext - str;
8641   else
8642     len = strlen (str);
8643
8644   if (len == 0)
8645     return AARCH64_PARSE_MISSING_ARG;
8646
8647
8648   /* Loop through the list of supported ARCHes to find a match.  */
8649   for (arch = all_architectures; arch->name != NULL; arch++)
8650     {
8651       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8652         {
8653           unsigned long isa_temp = arch->flags;
8654
8655           if (ext != NULL)
8656             {
8657               /* TO_PARSE string contains at least one extension.  */
8658               enum aarch64_parse_opt_result ext_res
8659                 = aarch64_parse_extension (ext, &isa_temp);
8660
8661               if (ext_res != AARCH64_PARSE_OK)
8662                 return ext_res;
8663             }
8664           /* Extension parsing was successful.  Confirm the result
8665              arch and ISA flags.  */
8666           *res = arch;
8667           *isa_flags = isa_temp;
8668           return AARCH64_PARSE_OK;
8669         }
8670     }
8671
8672   /* ARCH name not found in list.  */
8673   return AARCH64_PARSE_INVALID_ARG;
8674 }
8675
8676 /* Parse the TO_PARSE string and put the result tuning in RES and the
8677    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8678    describing the parse result.  If there is an error parsing, RES and
8679    ISA_FLAGS are left unchanged.  */
8680
8681 static enum aarch64_parse_opt_result
8682 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8683                    unsigned long *isa_flags)
8684 {
8685   char *ext;
8686   const struct processor *cpu;
8687   char *str = (char *) alloca (strlen (to_parse) + 1);
8688   size_t len;
8689
8690   strcpy (str, to_parse);
8691
8692   ext = strchr (str, '+');
8693
8694   if (ext != NULL)
8695     len = ext - str;
8696   else
8697     len = strlen (str);
8698
8699   if (len == 0)
8700     return AARCH64_PARSE_MISSING_ARG;
8701
8702
8703   /* Loop through the list of supported CPUs to find a match.  */
8704   for (cpu = all_cores; cpu->name != NULL; cpu++)
8705     {
8706       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8707         {
8708           unsigned long isa_temp = cpu->flags;
8709
8710
8711           if (ext != NULL)
8712             {
8713               /* TO_PARSE string contains at least one extension.  */
8714               enum aarch64_parse_opt_result ext_res
8715                 = aarch64_parse_extension (ext, &isa_temp);
8716
8717               if (ext_res != AARCH64_PARSE_OK)
8718                 return ext_res;
8719             }
8720           /* Extension parsing was successfull.  Confirm the result
8721              cpu and ISA flags.  */
8722           *res = cpu;
8723           *isa_flags = isa_temp;
8724           return AARCH64_PARSE_OK;
8725         }
8726     }
8727
8728   /* CPU name not found in list.  */
8729   return AARCH64_PARSE_INVALID_ARG;
8730 }
8731
8732 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8733    Return an aarch64_parse_opt_result describing the parse result.
8734    If the parsing fails the RES does not change.  */
8735
8736 static enum aarch64_parse_opt_result
8737 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8738 {
8739   const struct processor *cpu;
8740   char *str = (char *) alloca (strlen (to_parse) + 1);
8741
8742   strcpy (str, to_parse);
8743
8744   /* Loop through the list of supported CPUs to find a match.  */
8745   for (cpu = all_cores; cpu->name != NULL; cpu++)
8746     {
8747       if (strcmp (cpu->name, str) == 0)
8748         {
8749           *res = cpu;
8750           return AARCH64_PARSE_OK;
8751         }
8752     }
8753
8754   /* CPU name not found in list.  */
8755   return AARCH64_PARSE_INVALID_ARG;
8756 }
8757
8758 /* Parse TOKEN, which has length LENGTH to see if it is an option
8759    described in FLAG.  If it is, return the index bit for that fusion type.
8760    If not, error (printing OPTION_NAME) and return zero.  */
8761
8762 static unsigned int
8763 aarch64_parse_one_option_token (const char *token,
8764                                 size_t length,
8765                                 const struct aarch64_flag_desc *flag,
8766                                 const char *option_name)
8767 {
8768   for (; flag->name != NULL; flag++)
8769     {
8770       if (length == strlen (flag->name)
8771           && !strncmp (flag->name, token, length))
8772         return flag->flag;
8773     }
8774
8775   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8776   return 0;
8777 }
8778
8779 /* Parse OPTION which is a comma-separated list of flags to enable.
8780    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8781    default state we inherit from the CPU tuning structures.  OPTION_NAME
8782    gives the top-level option we are parsing in the -moverride string,
8783    for use in error messages.  */
8784
8785 static unsigned int
8786 aarch64_parse_boolean_options (const char *option,
8787                                const struct aarch64_flag_desc *flags,
8788                                unsigned int initial_state,
8789                                const char *option_name)
8790 {
8791   const char separator = '.';
8792   const char* specs = option;
8793   const char* ntoken = option;
8794   unsigned int found_flags = initial_state;
8795
8796   while ((ntoken = strchr (specs, separator)))
8797     {
8798       size_t token_length = ntoken - specs;
8799       unsigned token_ops = aarch64_parse_one_option_token (specs,
8800                                                            token_length,
8801                                                            flags,
8802                                                            option_name);
8803       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8804          in the token stream, reset the supported operations.  So:
8805
8806            adrp+add.cmp+branch.none.adrp+add
8807
8808            would have the result of turning on only adrp+add fusion.  */
8809       if (!token_ops)
8810         found_flags = 0;
8811
8812       found_flags |= token_ops;
8813       specs = ++ntoken;
8814     }
8815
8816   /* We ended with a comma, print something.  */
8817   if (!(*specs))
8818     {
8819       error ("%s string ill-formed\n", option_name);
8820       return 0;
8821     }
8822
8823   /* We still have one more token to parse.  */
8824   size_t token_length = strlen (specs);
8825   unsigned token_ops = aarch64_parse_one_option_token (specs,
8826                                                        token_length,
8827                                                        flags,
8828                                                        option_name);
8829    if (!token_ops)
8830      found_flags = 0;
8831
8832   found_flags |= token_ops;
8833   return found_flags;
8834 }
8835
8836 /* Support for overriding instruction fusion.  */
8837
8838 static void
8839 aarch64_parse_fuse_string (const char *fuse_string,
8840                             struct tune_params *tune)
8841 {
8842   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8843                                                      aarch64_fusible_pairs,
8844                                                      tune->fusible_ops,
8845                                                      "fuse=");
8846 }
8847
8848 /* Support for overriding other tuning flags.  */
8849
8850 static void
8851 aarch64_parse_tune_string (const char *tune_string,
8852                             struct tune_params *tune)
8853 {
8854   tune->extra_tuning_flags
8855     = aarch64_parse_boolean_options (tune_string,
8856                                      aarch64_tuning_flags,
8857                                      tune->extra_tuning_flags,
8858                                      "tune=");
8859 }
8860
8861 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8862    we understand.  If it is, extract the option string and handoff to
8863    the appropriate function.  */
8864
8865 void
8866 aarch64_parse_one_override_token (const char* token,
8867                                   size_t length,
8868                                   struct tune_params *tune)
8869 {
8870   const struct aarch64_tuning_override_function *fn
8871     = aarch64_tuning_override_functions;
8872
8873   const char *option_part = strchr (token, '=');
8874   if (!option_part)
8875     {
8876       error ("tuning string missing in option (%s)", token);
8877       return;
8878     }
8879
8880   /* Get the length of the option name.  */
8881   length = option_part - token;
8882   /* Skip the '=' to get to the option string.  */
8883   option_part++;
8884
8885   for (; fn->name != NULL; fn++)
8886     {
8887       if (!strncmp (fn->name, token, length))
8888         {
8889           fn->parse_override (option_part, tune);
8890           return;
8891         }
8892     }
8893
8894   error ("unknown tuning option (%s)",token);
8895   return;
8896 }
8897
8898 /* A checking mechanism for the implementation of the tls size.  */
8899
8900 static void
8901 initialize_aarch64_tls_size (struct gcc_options *opts)
8902 {
8903   if (aarch64_tls_size == 0)
8904     aarch64_tls_size = 24;
8905
8906   switch (opts->x_aarch64_cmodel_var)
8907     {
8908     case AARCH64_CMODEL_TINY:
8909       /* Both the default and maximum TLS size allowed under tiny is 1M which
8910          needs two instructions to address, so we clamp the size to 24.  */
8911       if (aarch64_tls_size > 24)
8912         aarch64_tls_size = 24;
8913       break;
8914     case AARCH64_CMODEL_SMALL:
8915       /* The maximum TLS size allowed under small is 4G.  */
8916       if (aarch64_tls_size > 32)
8917         aarch64_tls_size = 32;
8918       break;
8919     case AARCH64_CMODEL_LARGE:
8920       /* The maximum TLS size allowed under large is 16E.
8921          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8922       if (aarch64_tls_size > 48)
8923         aarch64_tls_size = 48;
8924       break;
8925     default:
8926       gcc_unreachable ();
8927     }
8928
8929   return;
8930 }
8931
8932 /* Parse STRING looking for options in the format:
8933      string     :: option:string
8934      option     :: name=substring
8935      name       :: {a-z}
8936      substring  :: defined by option.  */
8937
8938 static void
8939 aarch64_parse_override_string (const char* input_string,
8940                                struct tune_params* tune)
8941 {
8942   const char separator = ':';
8943   size_t string_length = strlen (input_string) + 1;
8944   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8945   char *string = string_root;
8946   strncpy (string, input_string, string_length);
8947   string[string_length - 1] = '\0';
8948
8949   char* ntoken = string;
8950
8951   while ((ntoken = strchr (string, separator)))
8952     {
8953       size_t token_length = ntoken - string;
8954       /* Make this substring look like a string.  */
8955       *ntoken = '\0';
8956       aarch64_parse_one_override_token (string, token_length, tune);
8957       string = ++ntoken;
8958     }
8959
8960   /* One last option to parse.  */
8961   aarch64_parse_one_override_token (string, strlen (string), tune);
8962   free (string_root);
8963 }
8964
8965
8966 static void
8967 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8968 {
8969   /* PR 70044: We have to be careful about being called multiple times for the
8970      same function.  This means all changes should be repeatable.  */
8971
8972   /* If the frame pointer is enabled, set it to a special value that behaves
8973      similar to frame pointer omission.  If we don't do this all leaf functions
8974      will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
8975      If flag_omit_frame_pointer has this special value, we must force the
8976      frame pointer if not in a leaf function.  We also need to force it in a
8977      leaf function if flag_omit_frame_pointer is not set or if LR is used.  */
8978   if (opts->x_flag_omit_frame_pointer == 0)
8979     opts->x_flag_omit_frame_pointer = 2;
8980
8981   /* If not optimizing for size, set the default
8982      alignment to what the target wants.  */
8983   if (!opts->x_optimize_size)
8984     {
8985       if (opts->x_align_loops <= 0)
8986         opts->x_align_loops = aarch64_tune_params.loop_align;
8987       if (opts->x_align_jumps <= 0)
8988         opts->x_align_jumps = aarch64_tune_params.jump_align;
8989       if (opts->x_align_functions <= 0)
8990         opts->x_align_functions = aarch64_tune_params.function_align;
8991     }
8992
8993   /* We default to no pc-relative literal loads.  */
8994
8995   aarch64_pcrelative_literal_loads = false;
8996
8997   /* If -mpc-relative-literal-loads is set on the command line, this
8998      implies that the user asked for PC relative literal loads.  */
8999   if (opts->x_pcrelative_literal_loads == 1)
9000     aarch64_pcrelative_literal_loads = true;
9001
9002   /* In the tiny memory model it makes no sense to disallow PC relative
9003      literal pool loads.  */
9004   if (aarch64_cmodel == AARCH64_CMODEL_TINY
9005       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9006     aarch64_pcrelative_literal_loads = true;
9007
9008   /* When enabling the lower precision Newton series for the square root, also
9009      enable it for the reciprocal square root, since the latter is an
9010      intermediary step for the former.  */
9011   if (flag_mlow_precision_sqrt)
9012     flag_mrecip_low_precision_sqrt = true;
9013 }
9014
9015 /* 'Unpack' up the internal tuning structs and update the options
9016     in OPTS.  The caller must have set up selected_tune and selected_arch
9017     as all the other target-specific codegen decisions are
9018     derived from them.  */
9019
9020 void
9021 aarch64_override_options_internal (struct gcc_options *opts)
9022 {
9023   aarch64_tune_flags = selected_tune->flags;
9024   aarch64_tune = selected_tune->sched_core;
9025   /* Make a copy of the tuning parameters attached to the core, which
9026      we may later overwrite.  */
9027   aarch64_tune_params = *(selected_tune->tune);
9028   aarch64_architecture_version = selected_arch->architecture_version;
9029
9030   if (opts->x_aarch64_override_tune_string)
9031     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9032                                   &aarch64_tune_params);
9033
9034   /* This target defaults to strict volatile bitfields.  */
9035   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9036     opts->x_flag_strict_volatile_bitfields = 1;
9037
9038   initialize_aarch64_code_model (opts);
9039   initialize_aarch64_tls_size (opts);
9040
9041   int queue_depth = 0;
9042   switch (aarch64_tune_params.autoprefetcher_model)
9043     {
9044       case tune_params::AUTOPREFETCHER_OFF:
9045         queue_depth = -1;
9046         break;
9047       case tune_params::AUTOPREFETCHER_WEAK:
9048         queue_depth = 0;
9049         break;
9050       case tune_params::AUTOPREFETCHER_STRONG:
9051         queue_depth = max_insn_queue_index + 1;
9052         break;
9053       default:
9054         gcc_unreachable ();
9055     }
9056
9057   /* We don't mind passing in global_options_set here as we don't use
9058      the *options_set structs anyway.  */
9059   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9060                          queue_depth,
9061                          opts->x_param_values,
9062                          global_options_set.x_param_values);
9063
9064   /* Set up parameters to be used in prefetching algorithm.  Do not
9065      override the defaults unless we are tuning for a core we have
9066      researched values for.  */
9067   if (aarch64_tune_params.prefetch->num_slots > 0)
9068     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9069                            aarch64_tune_params.prefetch->num_slots,
9070                            opts->x_param_values,
9071                            global_options_set.x_param_values);
9072   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9073     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9074                            aarch64_tune_params.prefetch->l1_cache_size,
9075                            opts->x_param_values,
9076                            global_options_set.x_param_values);
9077   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9078     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9079                            aarch64_tune_params.prefetch->l1_cache_line_size,
9080                            opts->x_param_values,
9081                            global_options_set.x_param_values);
9082   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9083     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9084                            aarch64_tune_params.prefetch->l2_cache_size,
9085                            opts->x_param_values,
9086                            global_options_set.x_param_values);
9087
9088   /* Enable sw prefetching at specified optimization level for
9089      CPUS that have prefetch.  Lower optimization level threshold by 1
9090      when profiling is enabled.  */
9091   if (opts->x_flag_prefetch_loop_arrays < 0
9092       && !opts->x_optimize_size
9093       && aarch64_tune_params.prefetch->default_opt_level >= 0
9094       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9095     opts->x_flag_prefetch_loop_arrays = 1;
9096
9097   aarch64_override_options_after_change_1 (opts);
9098 }
9099
9100 /* Print a hint with a suggestion for a core or architecture name that
9101    most closely resembles what the user passed in STR.  ARCH is true if
9102    the user is asking for an architecture name.  ARCH is false if the user
9103    is asking for a core name.  */
9104
9105 static void
9106 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9107 {
9108   auto_vec<const char *> candidates;
9109   const struct processor *entry = arch ? all_architectures : all_cores;
9110   for (; entry->name != NULL; entry++)
9111     candidates.safe_push (entry->name);
9112   char *s;
9113   const char *hint = candidates_list_and_hint (str, s, candidates);
9114   if (hint)
9115     inform (input_location, "valid arguments are: %s;"
9116                              " did you mean %qs?", s, hint);
9117   XDELETEVEC (s);
9118 }
9119
9120 /* Print a hint with a suggestion for a core name that most closely resembles
9121    what the user passed in STR.  */
9122
9123 inline static void
9124 aarch64_print_hint_for_core (const char *str)
9125 {
9126   aarch64_print_hint_for_core_or_arch (str, false);
9127 }
9128
9129 /* Print a hint with a suggestion for an architecture name that most closely
9130    resembles what the user passed in STR.  */
9131
9132 inline static void
9133 aarch64_print_hint_for_arch (const char *str)
9134 {
9135   aarch64_print_hint_for_core_or_arch (str, true);
9136 }
9137
9138 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9139    specified in STR and throw errors if appropriate.  Put the results if
9140    they are valid in RES and ISA_FLAGS.  Return whether the option is
9141    valid.  */
9142
9143 static bool
9144 aarch64_validate_mcpu (const char *str, const struct processor **res,
9145                        unsigned long *isa_flags)
9146 {
9147   enum aarch64_parse_opt_result parse_res
9148     = aarch64_parse_cpu (str, res, isa_flags);
9149
9150   if (parse_res == AARCH64_PARSE_OK)
9151     return true;
9152
9153   switch (parse_res)
9154     {
9155       case AARCH64_PARSE_MISSING_ARG:
9156         error ("missing cpu name in %<-mcpu=%s%>", str);
9157         break;
9158       case AARCH64_PARSE_INVALID_ARG:
9159         error ("unknown value %qs for -mcpu", str);
9160         aarch64_print_hint_for_core (str);
9161         break;
9162       case AARCH64_PARSE_INVALID_FEATURE:
9163         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9164         break;
9165       default:
9166         gcc_unreachable ();
9167     }
9168
9169   return false;
9170 }
9171
9172 /* Validate a command-line -march option.  Parse the arch and extensions
9173    (if any) specified in STR and throw errors if appropriate.  Put the
9174    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9175    option is valid.  */
9176
9177 static bool
9178 aarch64_validate_march (const char *str, const struct processor **res,
9179                          unsigned long *isa_flags)
9180 {
9181   enum aarch64_parse_opt_result parse_res
9182     = aarch64_parse_arch (str, res, isa_flags);
9183
9184   if (parse_res == AARCH64_PARSE_OK)
9185     return true;
9186
9187   switch (parse_res)
9188     {
9189       case AARCH64_PARSE_MISSING_ARG:
9190         error ("missing arch name in %<-march=%s%>", str);
9191         break;
9192       case AARCH64_PARSE_INVALID_ARG:
9193         error ("unknown value %qs for -march", str);
9194         aarch64_print_hint_for_arch (str);
9195         break;
9196       case AARCH64_PARSE_INVALID_FEATURE:
9197         error ("invalid feature modifier in %<-march=%s%>", str);
9198         break;
9199       default:
9200         gcc_unreachable ();
9201     }
9202
9203   return false;
9204 }
9205
9206 /* Validate a command-line -mtune option.  Parse the cpu
9207    specified in STR and throw errors if appropriate.  Put the
9208    result, if it is valid, in RES.  Return whether the option is
9209    valid.  */
9210
9211 static bool
9212 aarch64_validate_mtune (const char *str, const struct processor **res)
9213 {
9214   enum aarch64_parse_opt_result parse_res
9215     = aarch64_parse_tune (str, res);
9216
9217   if (parse_res == AARCH64_PARSE_OK)
9218     return true;
9219
9220   switch (parse_res)
9221     {
9222       case AARCH64_PARSE_MISSING_ARG:
9223         error ("missing cpu name in %<-mtune=%s%>", str);
9224         break;
9225       case AARCH64_PARSE_INVALID_ARG:
9226         error ("unknown value %qs for -mtune", str);
9227         aarch64_print_hint_for_core (str);
9228         break;
9229       default:
9230         gcc_unreachable ();
9231     }
9232   return false;
9233 }
9234
9235 /* Return the CPU corresponding to the enum CPU.
9236    If it doesn't specify a cpu, return the default.  */
9237
9238 static const struct processor *
9239 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9240 {
9241   if (cpu != aarch64_none)
9242     return &all_cores[cpu];
9243
9244   /* The & 0x3f is to extract the bottom 6 bits that encode the
9245      default cpu as selected by the --with-cpu GCC configure option
9246      in config.gcc.
9247      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9248      flags mechanism should be reworked to make it more sane.  */
9249   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9250 }
9251
9252 /* Return the architecture corresponding to the enum ARCH.
9253    If it doesn't specify a valid architecture, return the default.  */
9254
9255 static const struct processor *
9256 aarch64_get_arch (enum aarch64_arch arch)
9257 {
9258   if (arch != aarch64_no_arch)
9259     return &all_architectures[arch];
9260
9261   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9262
9263   return &all_architectures[cpu->arch];
9264 }
9265
9266 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9267    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9268    tuning structs.  In particular it must set selected_tune and
9269    aarch64_isa_flags that define the available ISA features and tuning
9270    decisions.  It must also set selected_arch as this will be used to
9271    output the .arch asm tags for each function.  */
9272
9273 static void
9274 aarch64_override_options (void)
9275 {
9276   unsigned long cpu_isa = 0;
9277   unsigned long arch_isa = 0;
9278   aarch64_isa_flags = 0;
9279
9280   bool valid_cpu = true;
9281   bool valid_tune = true;
9282   bool valid_arch = true;
9283
9284   selected_cpu = NULL;
9285   selected_arch = NULL;
9286   selected_tune = NULL;
9287
9288   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9289      If either of -march or -mtune is given, they override their
9290      respective component of -mcpu.  */
9291   if (aarch64_cpu_string)
9292     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9293                                         &cpu_isa);
9294
9295   if (aarch64_arch_string)
9296     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9297                                           &arch_isa);
9298
9299   if (aarch64_tune_string)
9300     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9301
9302   /* If the user did not specify a processor, choose the default
9303      one for them.  This will be the CPU set during configuration using
9304      --with-cpu, otherwise it is "generic".  */
9305   if (!selected_cpu)
9306     {
9307       if (selected_arch)
9308         {
9309           selected_cpu = &all_cores[selected_arch->ident];
9310           aarch64_isa_flags = arch_isa;
9311           explicit_arch = selected_arch->arch;
9312         }
9313       else
9314         {
9315           /* Get default configure-time CPU.  */
9316           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9317           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9318         }
9319
9320       if (selected_tune)
9321         explicit_tune_core = selected_tune->ident;
9322     }
9323   /* If both -mcpu and -march are specified check that they are architecturally
9324      compatible, warn if they're not and prefer the -march ISA flags.  */
9325   else if (selected_arch)
9326     {
9327       if (selected_arch->arch != selected_cpu->arch)
9328         {
9329           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9330                        all_architectures[selected_cpu->arch].name,
9331                        selected_arch->name);
9332         }
9333       aarch64_isa_flags = arch_isa;
9334       explicit_arch = selected_arch->arch;
9335       explicit_tune_core = selected_tune ? selected_tune->ident
9336                                           : selected_cpu->ident;
9337     }
9338   else
9339     {
9340       /* -mcpu but no -march.  */
9341       aarch64_isa_flags = cpu_isa;
9342       explicit_tune_core = selected_tune ? selected_tune->ident
9343                                           : selected_cpu->ident;
9344       gcc_assert (selected_cpu);
9345       selected_arch = &all_architectures[selected_cpu->arch];
9346       explicit_arch = selected_arch->arch;
9347     }
9348
9349   /* Set the arch as well as we will need it when outputing
9350      the .arch directive in assembly.  */
9351   if (!selected_arch)
9352     {
9353       gcc_assert (selected_cpu);
9354       selected_arch = &all_architectures[selected_cpu->arch];
9355     }
9356
9357   if (!selected_tune)
9358     selected_tune = selected_cpu;
9359
9360 #ifndef HAVE_AS_MABI_OPTION
9361   /* The compiler may have been configured with 2.23.* binutils, which does
9362      not have support for ILP32.  */
9363   if (TARGET_ILP32)
9364     error ("Assembler does not support -mabi=ilp32");
9365 #endif
9366
9367   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9368     sorry ("Return address signing is only supported for -mabi=lp64");
9369
9370   /* Make sure we properly set up the explicit options.  */
9371   if ((aarch64_cpu_string && valid_cpu)
9372        || (aarch64_tune_string && valid_tune))
9373     gcc_assert (explicit_tune_core != aarch64_none);
9374
9375   if ((aarch64_cpu_string && valid_cpu)
9376        || (aarch64_arch_string && valid_arch))
9377     gcc_assert (explicit_arch != aarch64_no_arch);
9378
9379   aarch64_override_options_internal (&global_options);
9380
9381   /* Save these options as the default ones in case we push and pop them later
9382      while processing functions with potential target attributes.  */
9383   target_option_default_node = target_option_current_node
9384       = build_target_option_node (&global_options);
9385 }
9386
9387 /* Implement targetm.override_options_after_change.  */
9388
9389 static void
9390 aarch64_override_options_after_change (void)
9391 {
9392   aarch64_override_options_after_change_1 (&global_options);
9393 }
9394
9395 static struct machine_function *
9396 aarch64_init_machine_status (void)
9397 {
9398   struct machine_function *machine;
9399   machine = ggc_cleared_alloc<machine_function> ();
9400   return machine;
9401 }
9402
9403 void
9404 aarch64_init_expanders (void)
9405 {
9406   init_machine_status = aarch64_init_machine_status;
9407 }
9408
9409 /* A checking mechanism for the implementation of the various code models.  */
9410 static void
9411 initialize_aarch64_code_model (struct gcc_options *opts)
9412 {
9413    if (opts->x_flag_pic)
9414      {
9415        switch (opts->x_aarch64_cmodel_var)
9416          {
9417          case AARCH64_CMODEL_TINY:
9418            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9419            break;
9420          case AARCH64_CMODEL_SMALL:
9421 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9422            aarch64_cmodel = (flag_pic == 2
9423                              ? AARCH64_CMODEL_SMALL_PIC
9424                              : AARCH64_CMODEL_SMALL_SPIC);
9425 #else
9426            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9427 #endif
9428            break;
9429          case AARCH64_CMODEL_LARGE:
9430            sorry ("code model %qs with -f%s", "large",
9431                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9432            break;
9433          default:
9434            gcc_unreachable ();
9435          }
9436      }
9437    else
9438      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9439 }
9440
9441 /* Implement TARGET_OPTION_SAVE.  */
9442
9443 static void
9444 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9445 {
9446   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9447 }
9448
9449 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9450    using the information saved in PTR.  */
9451
9452 static void
9453 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9454 {
9455   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9456   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9457   opts->x_explicit_arch = ptr->x_explicit_arch;
9458   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9459   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9460
9461   aarch64_override_options_internal (opts);
9462 }
9463
9464 /* Implement TARGET_OPTION_PRINT.  */
9465
9466 static void
9467 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9468 {
9469   const struct processor *cpu
9470     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9471   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9472   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9473   std::string extension
9474     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9475
9476   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9477   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9478            arch->name, extension.c_str ());
9479 }
9480
9481 static GTY(()) tree aarch64_previous_fndecl;
9482
9483 void
9484 aarch64_reset_previous_fndecl (void)
9485 {
9486   aarch64_previous_fndecl = NULL;
9487 }
9488
9489 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9490    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9491    make sure optab availability predicates are recomputed when necessary.  */
9492
9493 void
9494 aarch64_save_restore_target_globals (tree new_tree)
9495 {
9496   if (TREE_TARGET_GLOBALS (new_tree))
9497     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9498   else if (new_tree == target_option_default_node)
9499     restore_target_globals (&default_target_globals);
9500   else
9501     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9502 }
9503
9504 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9505    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9506    of the function, if such exists.  This function may be called multiple
9507    times on a single function so use aarch64_previous_fndecl to avoid
9508    setting up identical state.  */
9509
9510 static void
9511 aarch64_set_current_function (tree fndecl)
9512 {
9513   if (!fndecl || fndecl == aarch64_previous_fndecl)
9514     return;
9515
9516   tree old_tree = (aarch64_previous_fndecl
9517                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9518                    : NULL_TREE);
9519
9520   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9521
9522   /* If current function has no attributes but the previous one did,
9523      use the default node.  */
9524   if (!new_tree && old_tree)
9525     new_tree = target_option_default_node;
9526
9527   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9528      the default have been handled by aarch64_save_restore_target_globals from
9529      aarch64_pragma_target_parse.  */
9530   if (old_tree == new_tree)
9531     return;
9532
9533   aarch64_previous_fndecl = fndecl;
9534
9535   /* First set the target options.  */
9536   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9537
9538   aarch64_save_restore_target_globals (new_tree);
9539 }
9540
9541 /* Enum describing the various ways we can handle attributes.
9542    In many cases we can reuse the generic option handling machinery.  */
9543
9544 enum aarch64_attr_opt_type
9545 {
9546   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9547   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9548   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9549   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9550 };
9551
9552 /* All the information needed to handle a target attribute.
9553    NAME is the name of the attribute.
9554    ATTR_TYPE specifies the type of behavior of the attribute as described
9555    in the definition of enum aarch64_attr_opt_type.
9556    ALLOW_NEG is true if the attribute supports a "no-" form.
9557    HANDLER is the function that takes the attribute string and whether
9558    it is a pragma or attribute and handles the option.  It is needed only
9559    when the ATTR_TYPE is aarch64_attr_custom.
9560    OPT_NUM is the enum specifying the option that the attribute modifies.
9561    This is needed for attributes that mirror the behavior of a command-line
9562    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9563    aarch64_attr_enum.  */
9564
9565 struct aarch64_attribute_info
9566 {
9567   const char *name;
9568   enum aarch64_attr_opt_type attr_type;
9569   bool allow_neg;
9570   bool (*handler) (const char *, const char *);
9571   enum opt_code opt_num;
9572 };
9573
9574 /* Handle the ARCH_STR argument to the arch= target attribute.
9575    PRAGMA_OR_ATTR is used in potential error messages.  */
9576
9577 static bool
9578 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9579 {
9580   const struct processor *tmp_arch = NULL;
9581   enum aarch64_parse_opt_result parse_res
9582     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9583
9584   if (parse_res == AARCH64_PARSE_OK)
9585     {
9586       gcc_assert (tmp_arch);
9587       selected_arch = tmp_arch;
9588       explicit_arch = selected_arch->arch;
9589       return true;
9590     }
9591
9592   switch (parse_res)
9593     {
9594       case AARCH64_PARSE_MISSING_ARG:
9595         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9596         break;
9597       case AARCH64_PARSE_INVALID_ARG:
9598         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9599         aarch64_print_hint_for_arch (str);
9600         break;
9601       case AARCH64_PARSE_INVALID_FEATURE:
9602         error ("invalid feature modifier %qs for 'arch' target %s",
9603                str, pragma_or_attr);
9604         break;
9605       default:
9606         gcc_unreachable ();
9607     }
9608
9609   return false;
9610 }
9611
9612 /* Handle the argument CPU_STR to the cpu= target attribute.
9613    PRAGMA_OR_ATTR is used in potential error messages.  */
9614
9615 static bool
9616 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9617 {
9618   const struct processor *tmp_cpu = NULL;
9619   enum aarch64_parse_opt_result parse_res
9620     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9621
9622   if (parse_res == AARCH64_PARSE_OK)
9623     {
9624       gcc_assert (tmp_cpu);
9625       selected_tune = tmp_cpu;
9626       explicit_tune_core = selected_tune->ident;
9627
9628       selected_arch = &all_architectures[tmp_cpu->arch];
9629       explicit_arch = selected_arch->arch;
9630       return true;
9631     }
9632
9633   switch (parse_res)
9634     {
9635       case AARCH64_PARSE_MISSING_ARG:
9636         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9637         break;
9638       case AARCH64_PARSE_INVALID_ARG:
9639         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9640         aarch64_print_hint_for_core (str);
9641         break;
9642       case AARCH64_PARSE_INVALID_FEATURE:
9643         error ("invalid feature modifier %qs for 'cpu' target %s",
9644                str, pragma_or_attr);
9645         break;
9646       default:
9647         gcc_unreachable ();
9648     }
9649
9650   return false;
9651 }
9652
9653 /* Handle the argument STR to the tune= target attribute.
9654    PRAGMA_OR_ATTR is used in potential error messages.  */
9655
9656 static bool
9657 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9658 {
9659   const struct processor *tmp_tune = NULL;
9660   enum aarch64_parse_opt_result parse_res
9661     = aarch64_parse_tune (str, &tmp_tune);
9662
9663   if (parse_res == AARCH64_PARSE_OK)
9664     {
9665       gcc_assert (tmp_tune);
9666       selected_tune = tmp_tune;
9667       explicit_tune_core = selected_tune->ident;
9668       return true;
9669     }
9670
9671   switch (parse_res)
9672     {
9673       case AARCH64_PARSE_INVALID_ARG:
9674         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9675         aarch64_print_hint_for_core (str);
9676         break;
9677       default:
9678         gcc_unreachable ();
9679     }
9680
9681   return false;
9682 }
9683
9684 /* Parse an architecture extensions target attribute string specified in STR.
9685    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9686    if successful.  Update aarch64_isa_flags to reflect the ISA features
9687    modified.
9688    PRAGMA_OR_ATTR is used in potential error messages.  */
9689
9690 static bool
9691 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9692 {
9693   enum aarch64_parse_opt_result parse_res;
9694   unsigned long isa_flags = aarch64_isa_flags;
9695
9696   /* We allow "+nothing" in the beginning to clear out all architectural
9697      features if the user wants to handpick specific features.  */
9698   if (strncmp ("+nothing", str, 8) == 0)
9699     {
9700       isa_flags = 0;
9701       str += 8;
9702     }
9703
9704   parse_res = aarch64_parse_extension (str, &isa_flags);
9705
9706   if (parse_res == AARCH64_PARSE_OK)
9707     {
9708       aarch64_isa_flags = isa_flags;
9709       return true;
9710     }
9711
9712   switch (parse_res)
9713     {
9714       case AARCH64_PARSE_MISSING_ARG:
9715         error ("missing feature modifier in target %s %qs",
9716                pragma_or_attr, str);
9717         break;
9718
9719       case AARCH64_PARSE_INVALID_FEATURE:
9720         error ("invalid feature modifier in target %s %qs",
9721                pragma_or_attr, str);
9722         break;
9723
9724       default:
9725         gcc_unreachable ();
9726     }
9727
9728  return false;
9729 }
9730
9731 /* The target attributes that we support.  On top of these we also support just
9732    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9733    handled explicitly in aarch64_process_one_target_attr.  */
9734
9735 static const struct aarch64_attribute_info aarch64_attributes[] =
9736 {
9737   { "general-regs-only", aarch64_attr_mask, false, NULL,
9738      OPT_mgeneral_regs_only },
9739   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9740      OPT_mfix_cortex_a53_835769 },
9741   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9742      OPT_mfix_cortex_a53_843419 },
9743   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9744   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9745   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9746      OPT_momit_leaf_frame_pointer },
9747   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9748   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9749      OPT_march_ },
9750   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9751   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9752      OPT_mtune_ },
9753   { "sign-return-address", aarch64_attr_enum, false, NULL,
9754      OPT_msign_return_address_ },
9755   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9756 };
9757
9758 /* Parse ARG_STR which contains the definition of one target attribute.
9759    Show appropriate errors if any or return true if the attribute is valid.
9760    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9761    we're processing a target attribute or pragma.  */
9762
9763 static bool
9764 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9765 {
9766   bool invert = false;
9767
9768   size_t len = strlen (arg_str);
9769
9770   if (len == 0)
9771     {
9772       error ("malformed target %s", pragma_or_attr);
9773       return false;
9774     }
9775
9776   char *str_to_check = (char *) alloca (len + 1);
9777   strcpy (str_to_check, arg_str);
9778
9779   /* Skip leading whitespace.  */
9780   while (*str_to_check == ' ' || *str_to_check == '\t')
9781     str_to_check++;
9782
9783   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9784      It is easier to detect and handle it explicitly here rather than going
9785      through the machinery for the rest of the target attributes in this
9786      function.  */
9787   if (*str_to_check == '+')
9788     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9789
9790   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9791     {
9792       invert = true;
9793       str_to_check += 3;
9794     }
9795   char *arg = strchr (str_to_check, '=');
9796
9797   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9798      and point ARG to "foo".  */
9799   if (arg)
9800     {
9801       *arg = '\0';
9802       arg++;
9803     }
9804   const struct aarch64_attribute_info *p_attr;
9805   bool found = false;
9806   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9807     {
9808       /* If the names don't match up, or the user has given an argument
9809          to an attribute that doesn't accept one, or didn't give an argument
9810          to an attribute that expects one, fail to match.  */
9811       if (strcmp (str_to_check, p_attr->name) != 0)
9812         continue;
9813
9814       found = true;
9815       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9816                               || p_attr->attr_type == aarch64_attr_enum;
9817
9818       if (attr_need_arg_p ^ (arg != NULL))
9819         {
9820           error ("target %s %qs does not accept an argument",
9821                   pragma_or_attr, str_to_check);
9822           return false;
9823         }
9824
9825       /* If the name matches but the attribute does not allow "no-" versions
9826          then we can't match.  */
9827       if (invert && !p_attr->allow_neg)
9828         {
9829           error ("target %s %qs does not allow a negated form",
9830                   pragma_or_attr, str_to_check);
9831           return false;
9832         }
9833
9834       switch (p_attr->attr_type)
9835         {
9836         /* Has a custom handler registered.
9837            For example, cpu=, arch=, tune=.  */
9838           case aarch64_attr_custom:
9839             gcc_assert (p_attr->handler);
9840             if (!p_attr->handler (arg, pragma_or_attr))
9841               return false;
9842             break;
9843
9844           /* Either set or unset a boolean option.  */
9845           case aarch64_attr_bool:
9846             {
9847               struct cl_decoded_option decoded;
9848
9849               generate_option (p_attr->opt_num, NULL, !invert,
9850                                CL_TARGET, &decoded);
9851               aarch64_handle_option (&global_options, &global_options_set,
9852                                       &decoded, input_location);
9853               break;
9854             }
9855           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9856              should know what mask to apply given the option number.  */
9857           case aarch64_attr_mask:
9858             {
9859               struct cl_decoded_option decoded;
9860               /* We only need to specify the option number.
9861                  aarch64_handle_option will know which mask to apply.  */
9862               decoded.opt_index = p_attr->opt_num;
9863               decoded.value = !invert;
9864               aarch64_handle_option (&global_options, &global_options_set,
9865                                       &decoded, input_location);
9866               break;
9867             }
9868           /* Use the option setting machinery to set an option to an enum.  */
9869           case aarch64_attr_enum:
9870             {
9871               gcc_assert (arg);
9872               bool valid;
9873               int value;
9874               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9875                                               &value, CL_TARGET);
9876               if (valid)
9877                 {
9878                   set_option (&global_options, NULL, p_attr->opt_num, value,
9879                               NULL, DK_UNSPECIFIED, input_location,
9880                               global_dc);
9881                 }
9882               else
9883                 {
9884                   error ("target %s %s=%s is not valid",
9885                          pragma_or_attr, str_to_check, arg);
9886                 }
9887               break;
9888             }
9889           default:
9890             gcc_unreachable ();
9891         }
9892     }
9893
9894   /* If we reached here we either have found an attribute and validated
9895      it or didn't match any.  If we matched an attribute but its arguments
9896      were malformed we will have returned false already.  */
9897   return found;
9898 }
9899
9900 /* Count how many times the character C appears in
9901    NULL-terminated string STR.  */
9902
9903 static unsigned int
9904 num_occurences_in_str (char c, char *str)
9905 {
9906   unsigned int res = 0;
9907   while (*str != '\0')
9908     {
9909       if (*str == c)
9910         res++;
9911
9912       str++;
9913     }
9914
9915   return res;
9916 }
9917
9918 /* Parse the tree in ARGS that contains the target attribute information
9919    and update the global target options space.  PRAGMA_OR_ATTR is a string
9920    to be used in error messages, specifying whether this is processing
9921    a target attribute or a target pragma.  */
9922
9923 bool
9924 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9925 {
9926   if (TREE_CODE (args) == TREE_LIST)
9927     {
9928       do
9929         {
9930           tree head = TREE_VALUE (args);
9931           if (head)
9932             {
9933               if (!aarch64_process_target_attr (head, pragma_or_attr))
9934                 return false;
9935             }
9936           args = TREE_CHAIN (args);
9937         } while (args);
9938
9939       return true;
9940     }
9941
9942   if (TREE_CODE (args) != STRING_CST)
9943     {
9944       error ("attribute %<target%> argument not a string");
9945       return false;
9946     }
9947
9948   size_t len = strlen (TREE_STRING_POINTER (args));
9949   char *str_to_check = (char *) alloca (len + 1);
9950   strcpy (str_to_check, TREE_STRING_POINTER (args));
9951
9952   if (len == 0)
9953     {
9954       error ("malformed target %s value", pragma_or_attr);
9955       return false;
9956     }
9957
9958   /* Used to catch empty spaces between commas i.e.
9959      attribute ((target ("attr1,,attr2"))).  */
9960   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9961
9962   /* Handle multiple target attributes separated by ','.  */
9963   char *token = strtok (str_to_check, ",");
9964
9965   unsigned int num_attrs = 0;
9966   while (token)
9967     {
9968       num_attrs++;
9969       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9970         {
9971           error ("target %s %qs is invalid", pragma_or_attr, token);
9972           return false;
9973         }
9974
9975       token = strtok (NULL, ",");
9976     }
9977
9978   if (num_attrs != num_commas + 1)
9979     {
9980       error ("malformed target %s list %qs",
9981               pragma_or_attr, TREE_STRING_POINTER (args));
9982       return false;
9983     }
9984
9985   return true;
9986 }
9987
9988 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9989    process attribute ((target ("..."))).  */
9990
9991 static bool
9992 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9993 {
9994   struct cl_target_option cur_target;
9995   bool ret;
9996   tree old_optimize;
9997   tree new_target, new_optimize;
9998   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9999
10000   /* If what we're processing is the current pragma string then the
10001      target option node is already stored in target_option_current_node
10002      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
10003      having to re-parse the string.  This is especially useful to keep
10004      arm_neon.h compile times down since that header contains a lot
10005      of intrinsics enclosed in pragmas.  */
10006   if (!existing_target && args == current_target_pragma)
10007     {
10008       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10009       return true;
10010     }
10011   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10012
10013   old_optimize = build_optimization_node (&global_options);
10014   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10015
10016   /* If the function changed the optimization levels as well as setting
10017      target options, start with the optimizations specified.  */
10018   if (func_optimize && func_optimize != old_optimize)
10019     cl_optimization_restore (&global_options,
10020                              TREE_OPTIMIZATION (func_optimize));
10021
10022   /* Save the current target options to restore at the end.  */
10023   cl_target_option_save (&cur_target, &global_options);
10024
10025   /* If fndecl already has some target attributes applied to it, unpack
10026      them so that we add this attribute on top of them, rather than
10027      overwriting them.  */
10028   if (existing_target)
10029     {
10030       struct cl_target_option *existing_options
10031         = TREE_TARGET_OPTION (existing_target);
10032
10033       if (existing_options)
10034         cl_target_option_restore (&global_options, existing_options);
10035     }
10036   else
10037     cl_target_option_restore (&global_options,
10038                         TREE_TARGET_OPTION (target_option_current_node));
10039
10040
10041   ret = aarch64_process_target_attr (args, "attribute");
10042
10043   /* Set up any additional state.  */
10044   if (ret)
10045     {
10046       aarch64_override_options_internal (&global_options);
10047       /* Initialize SIMD builtins if we haven't already.
10048          Set current_target_pragma to NULL for the duration so that
10049          the builtin initialization code doesn't try to tag the functions
10050          being built with the attributes specified by any current pragma, thus
10051          going into an infinite recursion.  */
10052       if (TARGET_SIMD)
10053         {
10054           tree saved_current_target_pragma = current_target_pragma;
10055           current_target_pragma = NULL;
10056           aarch64_init_simd_builtins ();
10057           current_target_pragma = saved_current_target_pragma;
10058         }
10059       new_target = build_target_option_node (&global_options);
10060     }
10061   else
10062     new_target = NULL;
10063
10064   new_optimize = build_optimization_node (&global_options);
10065
10066   if (fndecl && ret)
10067     {
10068       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10069
10070       if (old_optimize != new_optimize)
10071         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10072     }
10073
10074   cl_target_option_restore (&global_options, &cur_target);
10075
10076   if (old_optimize != new_optimize)
10077     cl_optimization_restore (&global_options,
10078                              TREE_OPTIMIZATION (old_optimize));
10079   return ret;
10080 }
10081
10082 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10083    tri-bool options (yes, no, don't care) and the default value is
10084    DEF, determine whether to reject inlining.  */
10085
10086 static bool
10087 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10088                                      int dont_care, int def)
10089 {
10090   /* If the callee doesn't care, always allow inlining.  */
10091   if (callee == dont_care)
10092     return true;
10093
10094   /* If the caller doesn't care, always allow inlining.  */
10095   if (caller == dont_care)
10096     return true;
10097
10098   /* Otherwise, allow inlining if either the callee and caller values
10099      agree, or if the callee is using the default value.  */
10100   return (callee == caller || callee == def);
10101 }
10102
10103 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10104    to inline CALLEE into CALLER based on target-specific info.
10105    Make sure that the caller and callee have compatible architectural
10106    features.  Then go through the other possible target attributes
10107    and see if they can block inlining.  Try not to reject always_inline
10108    callees unless they are incompatible architecturally.  */
10109
10110 static bool
10111 aarch64_can_inline_p (tree caller, tree callee)
10112 {
10113   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10114   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10115
10116   /* If callee has no option attributes, then it is ok to inline.  */
10117   if (!callee_tree)
10118     return true;
10119
10120   struct cl_target_option *caller_opts
10121         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10122                                            : target_option_default_node);
10123
10124   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10125
10126
10127   /* Callee's ISA flags should be a subset of the caller's.  */
10128   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10129        != callee_opts->x_aarch64_isa_flags)
10130     return false;
10131
10132   /* Allow non-strict aligned functions inlining into strict
10133      aligned ones.  */
10134   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10135        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10136       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10137            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10138     return false;
10139
10140   bool always_inline = lookup_attribute ("always_inline",
10141                                           DECL_ATTRIBUTES (callee));
10142
10143   /* If the architectural features match up and the callee is always_inline
10144      then the other attributes don't matter.  */
10145   if (always_inline)
10146     return true;
10147
10148   if (caller_opts->x_aarch64_cmodel_var
10149       != callee_opts->x_aarch64_cmodel_var)
10150     return false;
10151
10152   if (caller_opts->x_aarch64_tls_dialect
10153       != callee_opts->x_aarch64_tls_dialect)
10154     return false;
10155
10156   /* Honour explicit requests to workaround errata.  */
10157   if (!aarch64_tribools_ok_for_inlining_p (
10158           caller_opts->x_aarch64_fix_a53_err835769,
10159           callee_opts->x_aarch64_fix_a53_err835769,
10160           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10161     return false;
10162
10163   if (!aarch64_tribools_ok_for_inlining_p (
10164           caller_opts->x_aarch64_fix_a53_err843419,
10165           callee_opts->x_aarch64_fix_a53_err843419,
10166           2, TARGET_FIX_ERR_A53_843419))
10167     return false;
10168
10169   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10170      caller and calle and they don't match up, reject inlining.  */
10171   if (!aarch64_tribools_ok_for_inlining_p (
10172           caller_opts->x_flag_omit_leaf_frame_pointer,
10173           callee_opts->x_flag_omit_leaf_frame_pointer,
10174           2, 1))
10175     return false;
10176
10177   /* If the callee has specific tuning overrides, respect them.  */
10178   if (callee_opts->x_aarch64_override_tune_string != NULL
10179       && caller_opts->x_aarch64_override_tune_string == NULL)
10180     return false;
10181
10182   /* If the user specified tuning override strings for the
10183      caller and callee and they don't match up, reject inlining.
10184      We just do a string compare here, we don't analyze the meaning
10185      of the string, as it would be too costly for little gain.  */
10186   if (callee_opts->x_aarch64_override_tune_string
10187       && caller_opts->x_aarch64_override_tune_string
10188       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10189                   caller_opts->x_aarch64_override_tune_string) != 0))
10190     return false;
10191
10192   return true;
10193 }
10194
10195 /* Return true if SYMBOL_REF X binds locally.  */
10196
10197 static bool
10198 aarch64_symbol_binds_local_p (const_rtx x)
10199 {
10200   return (SYMBOL_REF_DECL (x)
10201           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10202           : SYMBOL_REF_LOCAL_P (x));
10203 }
10204
10205 /* Return true if SYMBOL_REF X is thread local */
10206 static bool
10207 aarch64_tls_symbol_p (rtx x)
10208 {
10209   if (! TARGET_HAVE_TLS)
10210     return false;
10211
10212   if (GET_CODE (x) != SYMBOL_REF)
10213     return false;
10214
10215   return SYMBOL_REF_TLS_MODEL (x) != 0;
10216 }
10217
10218 /* Classify a TLS symbol into one of the TLS kinds.  */
10219 enum aarch64_symbol_type
10220 aarch64_classify_tls_symbol (rtx x)
10221 {
10222   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10223
10224   switch (tls_kind)
10225     {
10226     case TLS_MODEL_GLOBAL_DYNAMIC:
10227     case TLS_MODEL_LOCAL_DYNAMIC:
10228       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10229
10230     case TLS_MODEL_INITIAL_EXEC:
10231       switch (aarch64_cmodel)
10232         {
10233         case AARCH64_CMODEL_TINY:
10234         case AARCH64_CMODEL_TINY_PIC:
10235           return SYMBOL_TINY_TLSIE;
10236         default:
10237           return SYMBOL_SMALL_TLSIE;
10238         }
10239
10240     case TLS_MODEL_LOCAL_EXEC:
10241       if (aarch64_tls_size == 12)
10242         return SYMBOL_TLSLE12;
10243       else if (aarch64_tls_size == 24)
10244         return SYMBOL_TLSLE24;
10245       else if (aarch64_tls_size == 32)
10246         return SYMBOL_TLSLE32;
10247       else if (aarch64_tls_size == 48)
10248         return SYMBOL_TLSLE48;
10249       else
10250         gcc_unreachable ();
10251
10252     case TLS_MODEL_EMULATED:
10253     case TLS_MODEL_NONE:
10254       return SYMBOL_FORCE_TO_MEM;
10255
10256     default:
10257       gcc_unreachable ();
10258     }
10259 }
10260
10261 /* Return the method that should be used to access SYMBOL_REF or
10262    LABEL_REF X.  */
10263
10264 enum aarch64_symbol_type
10265 aarch64_classify_symbol (rtx x, rtx offset)
10266 {
10267   if (GET_CODE (x) == LABEL_REF)
10268     {
10269       switch (aarch64_cmodel)
10270         {
10271         case AARCH64_CMODEL_LARGE:
10272           return SYMBOL_FORCE_TO_MEM;
10273
10274         case AARCH64_CMODEL_TINY_PIC:
10275         case AARCH64_CMODEL_TINY:
10276           return SYMBOL_TINY_ABSOLUTE;
10277
10278         case AARCH64_CMODEL_SMALL_SPIC:
10279         case AARCH64_CMODEL_SMALL_PIC:
10280         case AARCH64_CMODEL_SMALL:
10281           return SYMBOL_SMALL_ABSOLUTE;
10282
10283         default:
10284           gcc_unreachable ();
10285         }
10286     }
10287
10288   if (GET_CODE (x) == SYMBOL_REF)
10289     {
10290       if (aarch64_tls_symbol_p (x))
10291         return aarch64_classify_tls_symbol (x);
10292
10293       switch (aarch64_cmodel)
10294         {
10295         case AARCH64_CMODEL_TINY:
10296           /* When we retrieve symbol + offset address, we have to make sure
10297              the offset does not cause overflow of the final address.  But
10298              we have no way of knowing the address of symbol at compile time
10299              so we can't accurately say if the distance between the PC and
10300              symbol + offset is outside the addressible range of +/-1M in the
10301              TINY code model.  So we rely on images not being greater than
10302              1M and cap the offset at 1M and anything beyond 1M will have to
10303              be loaded using an alternative mechanism.  Furthermore if the
10304              symbol is a weak reference to something that isn't known to
10305              resolve to a symbol in this module, then force to memory.  */
10306           if ((SYMBOL_REF_WEAK (x)
10307                && !aarch64_symbol_binds_local_p (x))
10308               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10309             return SYMBOL_FORCE_TO_MEM;
10310           return SYMBOL_TINY_ABSOLUTE;
10311
10312         case AARCH64_CMODEL_SMALL:
10313           /* Same reasoning as the tiny code model, but the offset cap here is
10314              4G.  */
10315           if ((SYMBOL_REF_WEAK (x)
10316                && !aarch64_symbol_binds_local_p (x))
10317               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10318                             HOST_WIDE_INT_C (4294967264)))
10319             return SYMBOL_FORCE_TO_MEM;
10320           return SYMBOL_SMALL_ABSOLUTE;
10321
10322         case AARCH64_CMODEL_TINY_PIC:
10323           if (!aarch64_symbol_binds_local_p (x))
10324             return SYMBOL_TINY_GOT;
10325           return SYMBOL_TINY_ABSOLUTE;
10326
10327         case AARCH64_CMODEL_SMALL_SPIC:
10328         case AARCH64_CMODEL_SMALL_PIC:
10329           if (!aarch64_symbol_binds_local_p (x))
10330             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10331                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10332           return SYMBOL_SMALL_ABSOLUTE;
10333
10334         case AARCH64_CMODEL_LARGE:
10335           /* This is alright even in PIC code as the constant
10336              pool reference is always PC relative and within
10337              the same translation unit.  */
10338           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10339             return SYMBOL_SMALL_ABSOLUTE;
10340           else
10341             return SYMBOL_FORCE_TO_MEM;
10342
10343         default:
10344           gcc_unreachable ();
10345         }
10346     }
10347
10348   /* By default push everything into the constant pool.  */
10349   return SYMBOL_FORCE_TO_MEM;
10350 }
10351
10352 bool
10353 aarch64_constant_address_p (rtx x)
10354 {
10355   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10356 }
10357
10358 bool
10359 aarch64_legitimate_pic_operand_p (rtx x)
10360 {
10361   if (GET_CODE (x) == SYMBOL_REF
10362       || (GET_CODE (x) == CONST
10363           && GET_CODE (XEXP (x, 0)) == PLUS
10364           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10365      return false;
10366
10367   return true;
10368 }
10369
10370 /* Return true if X holds either a quarter-precision or
10371      floating-point +0.0 constant.  */
10372 static bool
10373 aarch64_valid_floating_const (rtx x)
10374 {
10375   if (!CONST_DOUBLE_P (x))
10376     return false;
10377
10378   /* This call determines which constants can be used in mov<mode>
10379      as integer moves instead of constant loads.  */
10380   if (aarch64_float_const_rtx_p (x))
10381     return true;
10382
10383   return aarch64_float_const_representable_p (x);
10384 }
10385
10386 static bool
10387 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10388 {
10389   /* Do not allow vector struct mode constants.  We could support
10390      0 and -1 easily, but they need support in aarch64-simd.md.  */
10391   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10392     return false;
10393
10394   /* For these cases we never want to use a literal load.
10395      As such we have to prevent the compiler from forcing these
10396      to memory.  */
10397   if ((GET_CODE (x) == CONST_VECTOR
10398        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10399       || CONST_INT_P (x)
10400       || aarch64_valid_floating_const (x)
10401       || aarch64_can_const_movi_rtx_p (x, mode)
10402       || aarch64_float_const_rtx_p (x))
10403         return !targetm.cannot_force_const_mem (mode, x);
10404
10405   if (GET_CODE (x) == HIGH
10406       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10407     return true;
10408
10409   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10410      so spilling them is better than rematerialization.  */
10411   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10412     return true;
10413
10414   return aarch64_constant_address_p (x);
10415 }
10416
10417 rtx
10418 aarch64_load_tp (rtx target)
10419 {
10420   if (!target
10421       || GET_MODE (target) != Pmode
10422       || !register_operand (target, Pmode))
10423     target = gen_reg_rtx (Pmode);
10424
10425   /* Can return in any reg.  */
10426   emit_insn (gen_aarch64_load_tp_hard (target));
10427   return target;
10428 }
10429
10430 /* On AAPCS systems, this is the "struct __va_list".  */
10431 static GTY(()) tree va_list_type;
10432
10433 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10434    Return the type to use as __builtin_va_list.
10435
10436    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10437
10438    struct __va_list
10439    {
10440      void *__stack;
10441      void *__gr_top;
10442      void *__vr_top;
10443      int   __gr_offs;
10444      int   __vr_offs;
10445    };  */
10446
10447 static tree
10448 aarch64_build_builtin_va_list (void)
10449 {
10450   tree va_list_name;
10451   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10452
10453   /* Create the type.  */
10454   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10455   /* Give it the required name.  */
10456   va_list_name = build_decl (BUILTINS_LOCATION,
10457                              TYPE_DECL,
10458                              get_identifier ("__va_list"),
10459                              va_list_type);
10460   DECL_ARTIFICIAL (va_list_name) = 1;
10461   TYPE_NAME (va_list_type) = va_list_name;
10462   TYPE_STUB_DECL (va_list_type) = va_list_name;
10463
10464   /* Create the fields.  */
10465   f_stack = build_decl (BUILTINS_LOCATION,
10466                         FIELD_DECL, get_identifier ("__stack"),
10467                         ptr_type_node);
10468   f_grtop = build_decl (BUILTINS_LOCATION,
10469                         FIELD_DECL, get_identifier ("__gr_top"),
10470                         ptr_type_node);
10471   f_vrtop = build_decl (BUILTINS_LOCATION,
10472                         FIELD_DECL, get_identifier ("__vr_top"),
10473                         ptr_type_node);
10474   f_groff = build_decl (BUILTINS_LOCATION,
10475                         FIELD_DECL, get_identifier ("__gr_offs"),
10476                         integer_type_node);
10477   f_vroff = build_decl (BUILTINS_LOCATION,
10478                         FIELD_DECL, get_identifier ("__vr_offs"),
10479                         integer_type_node);
10480
10481   /* Tell tree-stdarg pass about our internal offset fields.
10482      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10483      purpose to identify whether the code is updating va_list internal
10484      offset fields through irregular way.  */
10485   va_list_gpr_counter_field = f_groff;
10486   va_list_fpr_counter_field = f_vroff;
10487
10488   DECL_ARTIFICIAL (f_stack) = 1;
10489   DECL_ARTIFICIAL (f_grtop) = 1;
10490   DECL_ARTIFICIAL (f_vrtop) = 1;
10491   DECL_ARTIFICIAL (f_groff) = 1;
10492   DECL_ARTIFICIAL (f_vroff) = 1;
10493
10494   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10495   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10496   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10497   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10498   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10499
10500   TYPE_FIELDS (va_list_type) = f_stack;
10501   DECL_CHAIN (f_stack) = f_grtop;
10502   DECL_CHAIN (f_grtop) = f_vrtop;
10503   DECL_CHAIN (f_vrtop) = f_groff;
10504   DECL_CHAIN (f_groff) = f_vroff;
10505
10506   /* Compute its layout.  */
10507   layout_type (va_list_type);
10508
10509   return va_list_type;
10510 }
10511
10512 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10513 static void
10514 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10515 {
10516   const CUMULATIVE_ARGS *cum;
10517   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10518   tree stack, grtop, vrtop, groff, vroff;
10519   tree t;
10520   int gr_save_area_size = cfun->va_list_gpr_size;
10521   int vr_save_area_size = cfun->va_list_fpr_size;
10522   int vr_offset;
10523
10524   cum = &crtl->args.info;
10525   if (cfun->va_list_gpr_size)
10526     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10527                              cfun->va_list_gpr_size);
10528   if (cfun->va_list_fpr_size)
10529     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10530                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10531
10532   if (!TARGET_FLOAT)
10533     {
10534       gcc_assert (cum->aapcs_nvrn == 0);
10535       vr_save_area_size = 0;
10536     }
10537
10538   f_stack = TYPE_FIELDS (va_list_type_node);
10539   f_grtop = DECL_CHAIN (f_stack);
10540   f_vrtop = DECL_CHAIN (f_grtop);
10541   f_groff = DECL_CHAIN (f_vrtop);
10542   f_vroff = DECL_CHAIN (f_groff);
10543
10544   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10545                   NULL_TREE);
10546   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10547                   NULL_TREE);
10548   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10549                   NULL_TREE);
10550   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10551                   NULL_TREE);
10552   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10553                   NULL_TREE);
10554
10555   /* Emit code to initialize STACK, which points to the next varargs stack
10556      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10557      by named arguments.  STACK is 8-byte aligned.  */
10558   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10559   if (cum->aapcs_stack_size > 0)
10560     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10561   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10562   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10563
10564   /* Emit code to initialize GRTOP, the top of the GR save area.
10565      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10566   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10567   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10568   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10569
10570   /* Emit code to initialize VRTOP, the top of the VR save area.
10571      This address is gr_save_area_bytes below GRTOP, rounded
10572      down to the next 16-byte boundary.  */
10573   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10574   vr_offset = ROUND_UP (gr_save_area_size,
10575                         STACK_BOUNDARY / BITS_PER_UNIT);
10576
10577   if (vr_offset)
10578     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10579   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10580   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10581
10582   /* Emit code to initialize GROFF, the offset from GRTOP of the
10583      next GPR argument.  */
10584   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10585               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10586   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10587
10588   /* Likewise emit code to initialize VROFF, the offset from FTOP
10589      of the next VR argument.  */
10590   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10591               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10592   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10593 }
10594
10595 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10596
10597 static tree
10598 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10599                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10600 {
10601   tree addr;
10602   bool indirect_p;
10603   bool is_ha;           /* is HFA or HVA.  */
10604   bool dw_align;        /* double-word align.  */
10605   machine_mode ag_mode = VOIDmode;
10606   int nregs;
10607   machine_mode mode;
10608
10609   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10610   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10611   HOST_WIDE_INT size, rsize, adjust, align;
10612   tree t, u, cond1, cond2;
10613
10614   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10615   if (indirect_p)
10616     type = build_pointer_type (type);
10617
10618   mode = TYPE_MODE (type);
10619
10620   f_stack = TYPE_FIELDS (va_list_type_node);
10621   f_grtop = DECL_CHAIN (f_stack);
10622   f_vrtop = DECL_CHAIN (f_grtop);
10623   f_groff = DECL_CHAIN (f_vrtop);
10624   f_vroff = DECL_CHAIN (f_groff);
10625
10626   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10627                   f_stack, NULL_TREE);
10628   size = int_size_in_bytes (type);
10629   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10630
10631   dw_align = false;
10632   adjust = 0;
10633   if (aarch64_vfp_is_call_or_return_candidate (mode,
10634                                                type,
10635                                                &ag_mode,
10636                                                &nregs,
10637                                                &is_ha))
10638     {
10639       /* TYPE passed in fp/simd registers.  */
10640       if (!TARGET_FLOAT)
10641         aarch64_err_no_fpadvsimd (mode, "varargs");
10642
10643       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10644                       unshare_expr (valist), f_vrtop, NULL_TREE);
10645       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10646                       unshare_expr (valist), f_vroff, NULL_TREE);
10647
10648       rsize = nregs * UNITS_PER_VREG;
10649
10650       if (is_ha)
10651         {
10652           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10653             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10654         }
10655       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10656                && size < UNITS_PER_VREG)
10657         {
10658           adjust = UNITS_PER_VREG - size;
10659         }
10660     }
10661   else
10662     {
10663       /* TYPE passed in general registers.  */
10664       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10665                       unshare_expr (valist), f_grtop, NULL_TREE);
10666       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10667                       unshare_expr (valist), f_groff, NULL_TREE);
10668       rsize = ROUND_UP (size, UNITS_PER_WORD);
10669       nregs = rsize / UNITS_PER_WORD;
10670
10671       if (align > 8)
10672         dw_align = true;
10673
10674       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10675           && size < UNITS_PER_WORD)
10676         {
10677           adjust = UNITS_PER_WORD  - size;
10678         }
10679     }
10680
10681   /* Get a local temporary for the field value.  */
10682   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10683
10684   /* Emit code to branch if off >= 0.  */
10685   t = build2 (GE_EXPR, boolean_type_node, off,
10686               build_int_cst (TREE_TYPE (off), 0));
10687   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10688
10689   if (dw_align)
10690     {
10691       /* Emit: offs = (offs + 15) & -16.  */
10692       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10693                   build_int_cst (TREE_TYPE (off), 15));
10694       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10695                   build_int_cst (TREE_TYPE (off), -16));
10696       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10697     }
10698   else
10699     roundup = NULL;
10700
10701   /* Update ap.__[g|v]r_offs  */
10702   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10703               build_int_cst (TREE_TYPE (off), rsize));
10704   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10705
10706   /* String up.  */
10707   if (roundup)
10708     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10709
10710   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10711   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10712               build_int_cst (TREE_TYPE (f_off), 0));
10713   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10714
10715   /* String up: make sure the assignment happens before the use.  */
10716   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10717   COND_EXPR_ELSE (cond1) = t;
10718
10719   /* Prepare the trees handling the argument that is passed on the stack;
10720      the top level node will store in ON_STACK.  */
10721   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10722   if (align > 8)
10723     {
10724       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10725       t = fold_convert (intDI_type_node, arg);
10726       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10727                   build_int_cst (TREE_TYPE (t), 15));
10728       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10729                   build_int_cst (TREE_TYPE (t), -16));
10730       t = fold_convert (TREE_TYPE (arg), t);
10731       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10732     }
10733   else
10734     roundup = NULL;
10735   /* Advance ap.__stack  */
10736   t = fold_convert (intDI_type_node, arg);
10737   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10738               build_int_cst (TREE_TYPE (t), size + 7));
10739   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10740               build_int_cst (TREE_TYPE (t), -8));
10741   t = fold_convert (TREE_TYPE (arg), t);
10742   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10743   /* String up roundup and advance.  */
10744   if (roundup)
10745     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10746   /* String up with arg */
10747   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10748   /* Big-endianness related address adjustment.  */
10749   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10750       && size < UNITS_PER_WORD)
10751   {
10752     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10753                 size_int (UNITS_PER_WORD - size));
10754     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10755   }
10756
10757   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10758   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10759
10760   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10761   t = off;
10762   if (adjust)
10763     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10764                 build_int_cst (TREE_TYPE (off), adjust));
10765
10766   t = fold_convert (sizetype, t);
10767   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10768
10769   if (is_ha)
10770     {
10771       /* type ha; // treat as "struct {ftype field[n];}"
10772          ... [computing offs]
10773          for (i = 0; i <nregs; ++i, offs += 16)
10774            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10775          return ha;  */
10776       int i;
10777       tree tmp_ha, field_t, field_ptr_t;
10778
10779       /* Declare a local variable.  */
10780       tmp_ha = create_tmp_var_raw (type, "ha");
10781       gimple_add_tmp_var (tmp_ha);
10782
10783       /* Establish the base type.  */
10784       switch (ag_mode)
10785         {
10786         case E_SFmode:
10787           field_t = float_type_node;
10788           field_ptr_t = float_ptr_type_node;
10789           break;
10790         case E_DFmode:
10791           field_t = double_type_node;
10792           field_ptr_t = double_ptr_type_node;
10793           break;
10794         case E_TFmode:
10795           field_t = long_double_type_node;
10796           field_ptr_t = long_double_ptr_type_node;
10797           break;
10798         case E_HFmode:
10799           field_t = aarch64_fp16_type_node;
10800           field_ptr_t = aarch64_fp16_ptr_type_node;
10801           break;
10802         case E_V2SImode:
10803         case E_V4SImode:
10804             {
10805               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10806               field_t = build_vector_type_for_mode (innertype, ag_mode);
10807               field_ptr_t = build_pointer_type (field_t);
10808             }
10809           break;
10810         default:
10811           gcc_assert (0);
10812         }
10813
10814       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10815       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10816       addr = t;
10817       t = fold_convert (field_ptr_t, addr);
10818       t = build2 (MODIFY_EXPR, field_t,
10819                   build1 (INDIRECT_REF, field_t, tmp_ha),
10820                   build1 (INDIRECT_REF, field_t, t));
10821
10822       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10823       for (i = 1; i < nregs; ++i)
10824         {
10825           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10826           u = fold_convert (field_ptr_t, addr);
10827           u = build2 (MODIFY_EXPR, field_t,
10828                       build2 (MEM_REF, field_t, tmp_ha,
10829                               build_int_cst (field_ptr_t,
10830                                              (i *
10831                                               int_size_in_bytes (field_t)))),
10832                       build1 (INDIRECT_REF, field_t, u));
10833           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10834         }
10835
10836       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10837       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10838     }
10839
10840   COND_EXPR_ELSE (cond2) = t;
10841   addr = fold_convert (build_pointer_type (type), cond1);
10842   addr = build_va_arg_indirect_ref (addr);
10843
10844   if (indirect_p)
10845     addr = build_va_arg_indirect_ref (addr);
10846
10847   return addr;
10848 }
10849
10850 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10851
10852 static void
10853 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10854                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10855                                 int no_rtl)
10856 {
10857   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10858   CUMULATIVE_ARGS local_cum;
10859   int gr_saved = cfun->va_list_gpr_size;
10860   int vr_saved = cfun->va_list_fpr_size;
10861
10862   /* The caller has advanced CUM up to, but not beyond, the last named
10863      argument.  Advance a local copy of CUM past the last "real" named
10864      argument, to find out how many registers are left over.  */
10865   local_cum = *cum;
10866   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10867
10868   /* Found out how many registers we need to save.
10869      Honor tree-stdvar analysis results.  */
10870   if (cfun->va_list_gpr_size)
10871     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10872                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10873   if (cfun->va_list_fpr_size)
10874     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10875                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10876
10877   if (!TARGET_FLOAT)
10878     {
10879       gcc_assert (local_cum.aapcs_nvrn == 0);
10880       vr_saved = 0;
10881     }
10882
10883   if (!no_rtl)
10884     {
10885       if (gr_saved > 0)
10886         {
10887           rtx ptr, mem;
10888
10889           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10890           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10891                                - gr_saved * UNITS_PER_WORD);
10892           mem = gen_frame_mem (BLKmode, ptr);
10893           set_mem_alias_set (mem, get_varargs_alias_set ());
10894
10895           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10896                                mem, gr_saved);
10897         }
10898       if (vr_saved > 0)
10899         {
10900           /* We can't use move_block_from_reg, because it will use
10901              the wrong mode, storing D regs only.  */
10902           machine_mode mode = TImode;
10903           int off, i, vr_start;
10904
10905           /* Set OFF to the offset from virtual_incoming_args_rtx of
10906              the first vector register.  The VR save area lies below
10907              the GR one, and is aligned to 16 bytes.  */
10908           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10909                            STACK_BOUNDARY / BITS_PER_UNIT);
10910           off -= vr_saved * UNITS_PER_VREG;
10911
10912           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10913           for (i = 0; i < vr_saved; ++i)
10914             {
10915               rtx ptr, mem;
10916
10917               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10918               mem = gen_frame_mem (mode, ptr);
10919               set_mem_alias_set (mem, get_varargs_alias_set ());
10920               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10921               off += UNITS_PER_VREG;
10922             }
10923         }
10924     }
10925
10926   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10927      any complication of having crtl->args.pretend_args_size changed.  */
10928   cfun->machine->frame.saved_varargs_size
10929     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10930                  STACK_BOUNDARY / BITS_PER_UNIT)
10931        + vr_saved * UNITS_PER_VREG);
10932 }
10933
10934 static void
10935 aarch64_conditional_register_usage (void)
10936 {
10937   int i;
10938   if (!TARGET_FLOAT)
10939     {
10940       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10941         {
10942           fixed_regs[i] = 1;
10943           call_used_regs[i] = 1;
10944         }
10945     }
10946 }
10947
10948 /* Walk down the type tree of TYPE counting consecutive base elements.
10949    If *MODEP is VOIDmode, then set it to the first valid floating point
10950    type.  If a non-floating point type is found, or if a floating point
10951    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10952    otherwise return the count in the sub-tree.  */
10953 static int
10954 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10955 {
10956   machine_mode mode;
10957   HOST_WIDE_INT size;
10958
10959   switch (TREE_CODE (type))
10960     {
10961     case REAL_TYPE:
10962       mode = TYPE_MODE (type);
10963       if (mode != DFmode && mode != SFmode
10964           && mode != TFmode && mode != HFmode)
10965         return -1;
10966
10967       if (*modep == VOIDmode)
10968         *modep = mode;
10969
10970       if (*modep == mode)
10971         return 1;
10972
10973       break;
10974
10975     case COMPLEX_TYPE:
10976       mode = TYPE_MODE (TREE_TYPE (type));
10977       if (mode != DFmode && mode != SFmode
10978           && mode != TFmode && mode != HFmode)
10979         return -1;
10980
10981       if (*modep == VOIDmode)
10982         *modep = mode;
10983
10984       if (*modep == mode)
10985         return 2;
10986
10987       break;
10988
10989     case VECTOR_TYPE:
10990       /* Use V2SImode and V4SImode as representatives of all 64-bit
10991          and 128-bit vector types.  */
10992       size = int_size_in_bytes (type);
10993       switch (size)
10994         {
10995         case 8:
10996           mode = V2SImode;
10997           break;
10998         case 16:
10999           mode = V4SImode;
11000           break;
11001         default:
11002           return -1;
11003         }
11004
11005       if (*modep == VOIDmode)
11006         *modep = mode;
11007
11008       /* Vector modes are considered to be opaque: two vectors are
11009          equivalent for the purposes of being homogeneous aggregates
11010          if they are the same size.  */
11011       if (*modep == mode)
11012         return 1;
11013
11014       break;
11015
11016     case ARRAY_TYPE:
11017       {
11018         int count;
11019         tree index = TYPE_DOMAIN (type);
11020
11021         /* Can't handle incomplete types nor sizes that are not
11022            fixed.  */
11023         if (!COMPLETE_TYPE_P (type)
11024             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11025           return -1;
11026
11027         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11028         if (count == -1
11029             || !index
11030             || !TYPE_MAX_VALUE (index)
11031             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11032             || !TYPE_MIN_VALUE (index)
11033             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11034             || count < 0)
11035           return -1;
11036
11037         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11038                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11039
11040         /* There must be no padding.  */
11041         if (wi::to_wide (TYPE_SIZE (type))
11042             != count * GET_MODE_BITSIZE (*modep))
11043           return -1;
11044
11045         return count;
11046       }
11047
11048     case RECORD_TYPE:
11049       {
11050         int count = 0;
11051         int sub_count;
11052         tree field;
11053
11054         /* Can't handle incomplete types nor sizes that are not
11055            fixed.  */
11056         if (!COMPLETE_TYPE_P (type)
11057             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11058           return -1;
11059
11060         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11061           {
11062             if (TREE_CODE (field) != FIELD_DECL)
11063               continue;
11064
11065             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11066             if (sub_count < 0)
11067               return -1;
11068             count += sub_count;
11069           }
11070
11071         /* There must be no padding.  */
11072         if (wi::to_wide (TYPE_SIZE (type))
11073             != count * GET_MODE_BITSIZE (*modep))
11074           return -1;
11075
11076         return count;
11077       }
11078
11079     case UNION_TYPE:
11080     case QUAL_UNION_TYPE:
11081       {
11082         /* These aren't very interesting except in a degenerate case.  */
11083         int count = 0;
11084         int sub_count;
11085         tree field;
11086
11087         /* Can't handle incomplete types nor sizes that are not
11088            fixed.  */
11089         if (!COMPLETE_TYPE_P (type)
11090             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11091           return -1;
11092
11093         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11094           {
11095             if (TREE_CODE (field) != FIELD_DECL)
11096               continue;
11097
11098             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11099             if (sub_count < 0)
11100               return -1;
11101             count = count > sub_count ? count : sub_count;
11102           }
11103
11104         /* There must be no padding.  */
11105         if (wi::to_wide (TYPE_SIZE (type))
11106             != count * GET_MODE_BITSIZE (*modep))
11107           return -1;
11108
11109         return count;
11110       }
11111
11112     default:
11113       break;
11114     }
11115
11116   return -1;
11117 }
11118
11119 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11120    type as described in AAPCS64 \S 4.1.2.
11121
11122    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11123
11124 static bool
11125 aarch64_short_vector_p (const_tree type,
11126                         machine_mode mode)
11127 {
11128   HOST_WIDE_INT size = -1;
11129
11130   if (type && TREE_CODE (type) == VECTOR_TYPE)
11131     size = int_size_in_bytes (type);
11132   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11133             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11134     size = GET_MODE_SIZE (mode);
11135
11136   return (size == 8 || size == 16);
11137 }
11138
11139 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11140    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11141    array types.  The C99 floating-point complex types are also considered
11142    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11143    types, which are GCC extensions and out of the scope of AAPCS64, are
11144    treated as composite types here as well.
11145
11146    Note that MODE itself is not sufficient in determining whether a type
11147    is such a composite type or not.  This is because
11148    stor-layout.c:compute_record_mode may have already changed the MODE
11149    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11150    structure with only one field may have its MODE set to the mode of the
11151    field.  Also an integer mode whose size matches the size of the
11152    RECORD_TYPE type may be used to substitute the original mode
11153    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11154    solely relied on.  */
11155
11156 static bool
11157 aarch64_composite_type_p (const_tree type,
11158                           machine_mode mode)
11159 {
11160   if (aarch64_short_vector_p (type, mode))
11161     return false;
11162
11163   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11164     return true;
11165
11166   if (mode == BLKmode
11167       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11168       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11169     return true;
11170
11171   return false;
11172 }
11173
11174 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11175    shall be passed or returned in simd/fp register(s) (providing these
11176    parameter passing registers are available).
11177
11178    Upon successful return, *COUNT returns the number of needed registers,
11179    *BASE_MODE returns the mode of the individual register and when IS_HAF
11180    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11181    floating-point aggregate or a homogeneous short-vector aggregate.  */
11182
11183 static bool
11184 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11185                                          const_tree type,
11186                                          machine_mode *base_mode,
11187                                          int *count,
11188                                          bool *is_ha)
11189 {
11190   machine_mode new_mode = VOIDmode;
11191   bool composite_p = aarch64_composite_type_p (type, mode);
11192
11193   if (is_ha != NULL) *is_ha = false;
11194
11195   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11196       || aarch64_short_vector_p (type, mode))
11197     {
11198       *count = 1;
11199       new_mode = mode;
11200     }
11201   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11202     {
11203       if (is_ha != NULL) *is_ha = true;
11204       *count = 2;
11205       new_mode = GET_MODE_INNER (mode);
11206     }
11207   else if (type && composite_p)
11208     {
11209       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11210
11211       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11212         {
11213           if (is_ha != NULL) *is_ha = true;
11214           *count = ag_count;
11215         }
11216       else
11217         return false;
11218     }
11219   else
11220     return false;
11221
11222   *base_mode = new_mode;
11223   return true;
11224 }
11225
11226 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11227
11228 static rtx
11229 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11230                           int incoming ATTRIBUTE_UNUSED)
11231 {
11232   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11233 }
11234
11235 /* Implements target hook vector_mode_supported_p.  */
11236 static bool
11237 aarch64_vector_mode_supported_p (machine_mode mode)
11238 {
11239   if (TARGET_SIMD
11240       && (mode == V4SImode  || mode == V8HImode
11241           || mode == V16QImode || mode == V2DImode
11242           || mode == V2SImode  || mode == V4HImode
11243           || mode == V8QImode || mode == V2SFmode
11244           || mode == V4SFmode || mode == V2DFmode
11245           || mode == V4HFmode || mode == V8HFmode
11246           || mode == V1DFmode))
11247     return true;
11248
11249   return false;
11250 }
11251
11252 /* Return appropriate SIMD container
11253    for MODE within a vector of WIDTH bits.  */
11254 static machine_mode
11255 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11256 {
11257   gcc_assert (width == 64 || width == 128);
11258   if (TARGET_SIMD)
11259     {
11260       if (width == 128)
11261         switch (mode)
11262           {
11263           case E_DFmode:
11264             return V2DFmode;
11265           case E_SFmode:
11266             return V4SFmode;
11267           case E_HFmode:
11268             return V8HFmode;
11269           case E_SImode:
11270             return V4SImode;
11271           case E_HImode:
11272             return V8HImode;
11273           case E_QImode:
11274             return V16QImode;
11275           case E_DImode:
11276             return V2DImode;
11277           default:
11278             break;
11279           }
11280       else
11281         switch (mode)
11282           {
11283           case E_SFmode:
11284             return V2SFmode;
11285           case E_HFmode:
11286             return V4HFmode;
11287           case E_SImode:
11288             return V2SImode;
11289           case E_HImode:
11290             return V4HImode;
11291           case E_QImode:
11292             return V8QImode;
11293           default:
11294             break;
11295           }
11296     }
11297   return word_mode;
11298 }
11299
11300 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11301 static machine_mode
11302 aarch64_preferred_simd_mode (scalar_mode mode)
11303 {
11304   return aarch64_simd_container_mode (mode, 128);
11305 }
11306
11307 /* Return the bitmask of possible vector sizes for the vectorizer
11308    to iterate over.  */
11309 static unsigned int
11310 aarch64_autovectorize_vector_sizes (void)
11311 {
11312   return (16 | 8);
11313 }
11314
11315 /* Implement TARGET_MANGLE_TYPE.  */
11316
11317 static const char *
11318 aarch64_mangle_type (const_tree type)
11319 {
11320   /* The AArch64 ABI documents say that "__va_list" has to be
11321      managled as if it is in the "std" namespace.  */
11322   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11323     return "St9__va_list";
11324
11325   /* Half-precision float.  */
11326   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11327     return "Dh";
11328
11329   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11330      builtin types.  */
11331   if (TYPE_NAME (type) != NULL)
11332     return aarch64_mangle_builtin_type (type);
11333
11334   /* Use the default mangling.  */
11335   return NULL;
11336 }
11337
11338 /* Find the first rtx_insn before insn that will generate an assembly
11339    instruction.  */
11340
11341 static rtx_insn *
11342 aarch64_prev_real_insn (rtx_insn *insn)
11343 {
11344   if (!insn)
11345     return NULL;
11346
11347   do
11348     {
11349       insn = prev_real_insn (insn);
11350     }
11351   while (insn && recog_memoized (insn) < 0);
11352
11353   return insn;
11354 }
11355
11356 static bool
11357 is_madd_op (enum attr_type t1)
11358 {
11359   unsigned int i;
11360   /* A number of these may be AArch32 only.  */
11361   enum attr_type mlatypes[] = {
11362     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11363     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11364     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11365   };
11366
11367   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11368     {
11369       if (t1 == mlatypes[i])
11370         return true;
11371     }
11372
11373   return false;
11374 }
11375
11376 /* Check if there is a register dependency between a load and the insn
11377    for which we hold recog_data.  */
11378
11379 static bool
11380 dep_between_memop_and_curr (rtx memop)
11381 {
11382   rtx load_reg;
11383   int opno;
11384
11385   gcc_assert (GET_CODE (memop) == SET);
11386
11387   if (!REG_P (SET_DEST (memop)))
11388     return false;
11389
11390   load_reg = SET_DEST (memop);
11391   for (opno = 1; opno < recog_data.n_operands; opno++)
11392     {
11393       rtx operand = recog_data.operand[opno];
11394       if (REG_P (operand)
11395           && reg_overlap_mentioned_p (load_reg, operand))
11396         return true;
11397
11398     }
11399   return false;
11400 }
11401
11402
11403 /* When working around the Cortex-A53 erratum 835769,
11404    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11405    instruction and has a preceding memory instruction such that a NOP
11406    should be inserted between them.  */
11407
11408 bool
11409 aarch64_madd_needs_nop (rtx_insn* insn)
11410 {
11411   enum attr_type attr_type;
11412   rtx_insn *prev;
11413   rtx body;
11414
11415   if (!TARGET_FIX_ERR_A53_835769)
11416     return false;
11417
11418   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11419     return false;
11420
11421   attr_type = get_attr_type (insn);
11422   if (!is_madd_op (attr_type))
11423     return false;
11424
11425   prev = aarch64_prev_real_insn (insn);
11426   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11427      Restore recog state to INSN to avoid state corruption.  */
11428   extract_constrain_insn_cached (insn);
11429
11430   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11431     return false;
11432
11433   body = single_set (prev);
11434
11435   /* If the previous insn is a memory op and there is no dependency between
11436      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11437      have a complex memory operation, probably a load/store pair.
11438      Be conservative for now and emit a NOP.  */
11439   if (GET_MODE (recog_data.operand[0]) == DImode
11440       && (!body || !dep_between_memop_and_curr (body)))
11441     return true;
11442
11443   return false;
11444
11445 }
11446
11447
11448 /* Implement FINAL_PRESCAN_INSN.  */
11449
11450 void
11451 aarch64_final_prescan_insn (rtx_insn *insn)
11452 {
11453   if (aarch64_madd_needs_nop (insn))
11454     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11455 }
11456
11457
11458 /* Return the equivalent letter for size.  */
11459 static char
11460 sizetochar (int size)
11461 {
11462   switch (size)
11463     {
11464     case 64: return 'd';
11465     case 32: return 's';
11466     case 16: return 'h';
11467     case 8 : return 'b';
11468     default: gcc_unreachable ();
11469     }
11470 }
11471
11472 /* Return true iff x is a uniform vector of floating-point
11473    constants, and the constant can be represented in
11474    quarter-precision form.  Note, as aarch64_float_const_representable
11475    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11476 static bool
11477 aarch64_vect_float_const_representable_p (rtx x)
11478 {
11479   rtx elt;
11480   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11481           && const_vec_duplicate_p (x, &elt)
11482           && aarch64_float_const_representable_p (elt));
11483 }
11484
11485 /* Return true for valid and false for invalid.  */
11486 bool
11487 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11488                               struct simd_immediate_info *info,
11489                               enum simd_immediate_check which)
11490 {
11491 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11492   matches = 1;                                          \
11493   for (i = 0; i < idx; i += (STRIDE))                   \
11494     if (!(TEST))                                        \
11495       matches = 0;                                      \
11496   if (matches)                                          \
11497     {                                                   \
11498       immtype = (CLASS);                                \
11499       elsize = (ELSIZE);                                \
11500       eshift = (SHIFT);                                 \
11501       emvn = (NEG);                                     \
11502       break;                                            \
11503     }
11504
11505   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11506   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11507   unsigned char bytes[16];
11508   int immtype = -1, matches;
11509   unsigned int invmask = inverse ? 0xff : 0;
11510   int eshift, emvn;
11511
11512   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11513     {
11514       if (! (aarch64_simd_imm_zero_p (op, mode)
11515              || aarch64_vect_float_const_representable_p (op)))
11516         return false;
11517
11518       if (info)
11519         {
11520           rtx elt = CONST_VECTOR_ELT (op, 0);
11521           scalar_float_mode elt_mode
11522             = as_a <scalar_float_mode> (GET_MODE (elt));
11523
11524           info->value = elt;
11525           info->element_width = GET_MODE_BITSIZE (elt_mode);
11526           info->mvn = false;
11527           info->shift = 0;
11528         }
11529
11530       return true;
11531     }
11532
11533   /* Splat vector constant out into a byte vector.  */
11534   for (i = 0; i < n_elts; i++)
11535     {
11536       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11537          it must be laid out in the vector register in reverse order.  */
11538       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11539       unsigned HOST_WIDE_INT elpart;
11540
11541       gcc_assert (CONST_INT_P (el));
11542       elpart = INTVAL (el);
11543
11544       for (unsigned int byte = 0; byte < innersize; byte++)
11545         {
11546           bytes[idx++] = (elpart & 0xff) ^ invmask;
11547           elpart >>= BITS_PER_UNIT;
11548         }
11549
11550     }
11551
11552   /* Sanity check.  */
11553   gcc_assert (idx == GET_MODE_SIZE (mode));
11554
11555   do
11556     {
11557       if (which & AARCH64_CHECK_ORR)
11558         {
11559           CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11560                  && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11561
11562           CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11563                  && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11564
11565           CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11566                  && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11567
11568           CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11569                  && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11570
11571           CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11572
11573           CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11574         }
11575
11576       if (which & AARCH64_CHECK_BIC)
11577         {
11578           CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11579                  && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11580
11581           CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11582                  && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11583
11584           CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11585                  && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11586
11587           CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11588                  && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11589
11590           CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11591
11592           CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11593         }
11594
11595       /* Shifting ones / 8-bit / 64-bit variants only checked
11596          for 'ALL' (MOVI/MVNI).  */
11597       if (which == AARCH64_CHECK_MOV)
11598         {
11599           CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11600                  && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11601
11602           CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11603                  && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11604
11605           CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11606                  && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11607
11608           CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11609                  && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11610
11611           CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11612
11613           CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11614                  && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11615         }
11616     }
11617   while (0);
11618
11619   if (immtype == -1)
11620     return false;
11621
11622   if (info)
11623     {
11624       info->element_width = elsize;
11625       info->mvn = emvn != 0;
11626       info->shift = eshift;
11627
11628       unsigned HOST_WIDE_INT imm = 0;
11629
11630       if (immtype >= 12 && immtype <= 15)
11631         info->msl = true;
11632
11633       /* Un-invert bytes of recognized vector, if necessary.  */
11634       if (invmask != 0)
11635         for (i = 0; i < idx; i++)
11636           bytes[i] ^= invmask;
11637
11638       if (immtype == 17)
11639         {
11640           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11641           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11642
11643           for (i = 0; i < 8; i++)
11644             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11645               << (i * BITS_PER_UNIT);
11646
11647
11648           info->value = GEN_INT (imm);
11649         }
11650       else
11651         {
11652           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11653             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11654
11655           /* Construct 'abcdefgh' because the assembler cannot handle
11656              generic constants.  */
11657           if (info->mvn)
11658             imm = ~imm;
11659           imm = (imm >> info->shift) & 0xff;
11660           info->value = GEN_INT (imm);
11661         }
11662     }
11663
11664   return true;
11665 #undef CHECK
11666 }
11667
11668 /* Check of immediate shift constants are within range.  */
11669 bool
11670 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11671 {
11672   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11673   if (left)
11674     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11675   else
11676     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11677 }
11678
11679 /* Return true if X is a uniform vector where all elements
11680    are either the floating-point constant 0.0 or the
11681    integer constant 0.  */
11682 bool
11683 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11684 {
11685   return x == CONST0_RTX (mode);
11686 }
11687
11688
11689 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11690    operation of width WIDTH at bit position POS.  */
11691
11692 rtx
11693 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11694 {
11695   gcc_assert (CONST_INT_P (width));
11696   gcc_assert (CONST_INT_P (pos));
11697
11698   unsigned HOST_WIDE_INT mask
11699     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11700   return GEN_INT (mask << UINTVAL (pos));
11701 }
11702
11703 bool
11704 aarch64_mov_operand_p (rtx x, machine_mode mode)
11705 {
11706   if (GET_CODE (x) == HIGH
11707       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11708     return true;
11709
11710   if (CONST_INT_P (x))
11711     return true;
11712
11713   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11714     return true;
11715
11716   return aarch64_classify_symbolic_expression (x)
11717     == SYMBOL_TINY_ABSOLUTE;
11718 }
11719
11720 /* Return a const_int vector of VAL.  */
11721 rtx
11722 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11723 {
11724   int nunits = GET_MODE_NUNITS (mode);
11725   rtvec v = rtvec_alloc (nunits);
11726   int i;
11727
11728   rtx cache = GEN_INT (val);
11729
11730   for (i=0; i < nunits; i++)
11731     RTVEC_ELT (v, i) = cache;
11732
11733   return gen_rtx_CONST_VECTOR (mode, v);
11734 }
11735
11736 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11737
11738 bool
11739 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11740 {
11741   machine_mode vmode;
11742
11743   vmode = aarch64_preferred_simd_mode (mode);
11744   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11745   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11746 }
11747
11748 /* Construct and return a PARALLEL RTX vector with elements numbering the
11749    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11750    the vector - from the perspective of the architecture.  This does not
11751    line up with GCC's perspective on lane numbers, so we end up with
11752    different masks depending on our target endian-ness.  The diagram
11753    below may help.  We must draw the distinction when building masks
11754    which select one half of the vector.  An instruction selecting
11755    architectural low-lanes for a big-endian target, must be described using
11756    a mask selecting GCC high-lanes.
11757
11758                  Big-Endian             Little-Endian
11759
11760 GCC             0   1   2   3           3   2   1   0
11761               | x | x | x | x |       | x | x | x | x |
11762 Architecture    3   2   1   0           3   2   1   0
11763
11764 Low Mask:         { 2, 3 }                { 0, 1 }
11765 High Mask:        { 0, 1 }                { 2, 3 }
11766 */
11767
11768 rtx
11769 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11770 {
11771   int nunits = GET_MODE_NUNITS (mode);
11772   rtvec v = rtvec_alloc (nunits / 2);
11773   int high_base = nunits / 2;
11774   int low_base = 0;
11775   int base;
11776   rtx t1;
11777   int i;
11778
11779   if (BYTES_BIG_ENDIAN)
11780     base = high ? low_base : high_base;
11781   else
11782     base = high ? high_base : low_base;
11783
11784   for (i = 0; i < nunits / 2; i++)
11785     RTVEC_ELT (v, i) = GEN_INT (base + i);
11786
11787   t1 = gen_rtx_PARALLEL (mode, v);
11788   return t1;
11789 }
11790
11791 /* Check OP for validity as a PARALLEL RTX vector with elements
11792    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11793    from the perspective of the architecture.  See the diagram above
11794    aarch64_simd_vect_par_cnst_half for more details.  */
11795
11796 bool
11797 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11798                                        bool high)
11799 {
11800   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11801   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11802   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11803   int i = 0;
11804
11805   if (!VECTOR_MODE_P (mode))
11806     return false;
11807
11808   if (count_op != count_ideal)
11809     return false;
11810
11811   for (i = 0; i < count_ideal; i++)
11812     {
11813       rtx elt_op = XVECEXP (op, 0, i);
11814       rtx elt_ideal = XVECEXP (ideal, 0, i);
11815
11816       if (!CONST_INT_P (elt_op)
11817           || INTVAL (elt_ideal) != INTVAL (elt_op))
11818         return false;
11819     }
11820   return true;
11821 }
11822
11823 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11824    HIGH (exclusive).  */
11825 void
11826 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11827                           const_tree exp)
11828 {
11829   HOST_WIDE_INT lane;
11830   gcc_assert (CONST_INT_P (operand));
11831   lane = INTVAL (operand);
11832
11833   if (lane < low || lane >= high)
11834   {
11835     if (exp)
11836       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11837     else
11838       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11839   }
11840 }
11841
11842 /* Return TRUE if OP is a valid vector addressing mode.  */
11843 bool
11844 aarch64_simd_mem_operand_p (rtx op)
11845 {
11846   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11847                         || REG_P (XEXP (op, 0)));
11848 }
11849
11850 /* Emit a register copy from operand to operand, taking care not to
11851    early-clobber source registers in the process.
11852
11853    COUNT is the number of components into which the copy needs to be
11854    decomposed.  */
11855 void
11856 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11857                                 unsigned int count)
11858 {
11859   unsigned int i;
11860   int rdest = REGNO (operands[0]);
11861   int rsrc = REGNO (operands[1]);
11862
11863   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11864       || rdest < rsrc)
11865     for (i = 0; i < count; i++)
11866       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11867                       gen_rtx_REG (mode, rsrc + i));
11868   else
11869     for (i = 0; i < count; i++)
11870       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11871                       gen_rtx_REG (mode, rsrc + count - i - 1));
11872 }
11873
11874 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11875    one of VSTRUCT modes: OI, CI, or XI.  */
11876 int
11877 aarch64_simd_attr_length_rglist (machine_mode mode)
11878 {
11879   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11880 }
11881
11882 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11883    alignment of a vector to 128 bits.  */
11884 static HOST_WIDE_INT
11885 aarch64_simd_vector_alignment (const_tree type)
11886 {
11887   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11888   return MIN (align, 128);
11889 }
11890
11891 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11892 static bool
11893 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11894 {
11895   if (is_packed)
11896     return false;
11897
11898   /* We guarantee alignment for vectors up to 128-bits.  */
11899   if (tree_int_cst_compare (TYPE_SIZE (type),
11900                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11901     return false;
11902
11903   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11904   return true;
11905 }
11906
11907 /* Return true if the vector misalignment factor is supported by the
11908    target.  */
11909 static bool
11910 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11911                                              const_tree type, int misalignment,
11912                                              bool is_packed)
11913 {
11914   if (TARGET_SIMD && STRICT_ALIGNMENT)
11915     {
11916       /* Return if movmisalign pattern is not supported for this mode.  */
11917       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11918         return false;
11919
11920       /* Misalignment factor is unknown at compile time.  */
11921       if (misalignment == -1)
11922         return false;
11923     }
11924   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11925                                                       is_packed);
11926 }
11927
11928 /* If VALS is a vector constant that can be loaded into a register
11929    using DUP, generate instructions to do so and return an RTX to
11930    assign to the register.  Otherwise return NULL_RTX.  */
11931 static rtx
11932 aarch64_simd_dup_constant (rtx vals)
11933 {
11934   machine_mode mode = GET_MODE (vals);
11935   machine_mode inner_mode = GET_MODE_INNER (mode);
11936   rtx x;
11937
11938   if (!const_vec_duplicate_p (vals, &x))
11939     return NULL_RTX;
11940
11941   /* We can load this constant by using DUP and a constant in a
11942      single ARM register.  This will be cheaper than a vector
11943      load.  */
11944   x = copy_to_mode_reg (inner_mode, x);
11945   return gen_rtx_VEC_DUPLICATE (mode, x);
11946 }
11947
11948
11949 /* Generate code to load VALS, which is a PARALLEL containing only
11950    constants (for vec_init) or CONST_VECTOR, efficiently into a
11951    register.  Returns an RTX to copy into the register, or NULL_RTX
11952    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11953 static rtx
11954 aarch64_simd_make_constant (rtx vals)
11955 {
11956   machine_mode mode = GET_MODE (vals);
11957   rtx const_dup;
11958   rtx const_vec = NULL_RTX;
11959   int n_elts = GET_MODE_NUNITS (mode);
11960   int n_const = 0;
11961   int i;
11962
11963   if (GET_CODE (vals) == CONST_VECTOR)
11964     const_vec = vals;
11965   else if (GET_CODE (vals) == PARALLEL)
11966     {
11967       /* A CONST_VECTOR must contain only CONST_INTs and
11968          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11969          Only store valid constants in a CONST_VECTOR.  */
11970       for (i = 0; i < n_elts; ++i)
11971         {
11972           rtx x = XVECEXP (vals, 0, i);
11973           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11974             n_const++;
11975         }
11976       if (n_const == n_elts)
11977         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11978     }
11979   else
11980     gcc_unreachable ();
11981
11982   if (const_vec != NULL_RTX
11983       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11984     /* Load using MOVI/MVNI.  */
11985     return const_vec;
11986   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11987     /* Loaded using DUP.  */
11988     return const_dup;
11989   else if (const_vec != NULL_RTX)
11990     /* Load from constant pool. We can not take advantage of single-cycle
11991        LD1 because we need a PC-relative addressing mode.  */
11992     return const_vec;
11993   else
11994     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11995        We can not construct an initializer.  */
11996     return NULL_RTX;
11997 }
11998
11999 /* Expand a vector initialisation sequence, such that TARGET is
12000    initialised to contain VALS.  */
12001
12002 void
12003 aarch64_expand_vector_init (rtx target, rtx vals)
12004 {
12005   machine_mode mode = GET_MODE (target);
12006   scalar_mode inner_mode = GET_MODE_INNER (mode);
12007   /* The number of vector elements.  */
12008   int n_elts = GET_MODE_NUNITS (mode);
12009   /* The number of vector elements which are not constant.  */
12010   int n_var = 0;
12011   rtx any_const = NULL_RTX;
12012   /* The first element of vals.  */
12013   rtx v0 = XVECEXP (vals, 0, 0);
12014   bool all_same = true;
12015
12016   /* Count the number of variable elements to initialise.  */
12017   for (int i = 0; i < n_elts; ++i)
12018     {
12019       rtx x = XVECEXP (vals, 0, i);
12020       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12021         ++n_var;
12022       else
12023         any_const = x;
12024
12025       all_same &= rtx_equal_p (x, v0);
12026     }
12027
12028   /* No variable elements, hand off to aarch64_simd_make_constant which knows
12029      how best to handle this.  */
12030   if (n_var == 0)
12031     {
12032       rtx constant = aarch64_simd_make_constant (vals);
12033       if (constant != NULL_RTX)
12034         {
12035           emit_move_insn (target, constant);
12036           return;
12037         }
12038     }
12039
12040   /* Splat a single non-constant element if we can.  */
12041   if (all_same)
12042     {
12043       rtx x = copy_to_mode_reg (inner_mode, v0);
12044       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12045       return;
12046     }
12047
12048   enum insn_code icode = optab_handler (vec_set_optab, mode);
12049   gcc_assert (icode != CODE_FOR_nothing);
12050
12051   /* If there are only variable elements, try to optimize
12052      the insertion using dup for the most common element
12053      followed by insertions.  */
12054
12055   /* The algorithm will fill matches[*][0] with the earliest matching element,
12056      and matches[X][1] with the count of duplicate elements (if X is the
12057      earliest element which has duplicates).  */
12058
12059   if (n_var == n_elts && n_elts <= 16)
12060     {
12061       int matches[16][2] = {0};
12062       for (int i = 0; i < n_elts; i++)
12063         {
12064           for (int j = 0; j <= i; j++)
12065             {
12066               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12067                 {
12068                   matches[i][0] = j;
12069                   matches[j][1]++;
12070                   break;
12071                 }
12072             }
12073         }
12074       int maxelement = 0;
12075       int maxv = 0;
12076       for (int i = 0; i < n_elts; i++)
12077         if (matches[i][1] > maxv)
12078           {
12079             maxelement = i;
12080             maxv = matches[i][1];
12081           }
12082
12083       /* Create a duplicate of the most common element.  */
12084       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12085       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12086
12087       /* Insert the rest.  */
12088       for (int i = 0; i < n_elts; i++)
12089         {
12090           rtx x = XVECEXP (vals, 0, i);
12091           if (matches[i][0] == maxelement)
12092             continue;
12093           x = copy_to_mode_reg (inner_mode, x);
12094           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12095         }
12096       return;
12097     }
12098
12099   /* Initialise a vector which is part-variable.  We want to first try
12100      to build those lanes which are constant in the most efficient way we
12101      can.  */
12102   if (n_var != n_elts)
12103     {
12104       rtx copy = copy_rtx (vals);
12105
12106       /* Load constant part of vector.  We really don't care what goes into the
12107          parts we will overwrite, but we're more likely to be able to load the
12108          constant efficiently if it has fewer, larger, repeating parts
12109          (see aarch64_simd_valid_immediate).  */
12110       for (int i = 0; i < n_elts; i++)
12111         {
12112           rtx x = XVECEXP (vals, 0, i);
12113           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12114             continue;
12115           rtx subst = any_const;
12116           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12117             {
12118               /* Look in the copied vector, as more elements are const.  */
12119               rtx test = XVECEXP (copy, 0, i ^ bit);
12120               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12121                 {
12122                   subst = test;
12123                   break;
12124                 }
12125             }
12126           XVECEXP (copy, 0, i) = subst;
12127         }
12128       aarch64_expand_vector_init (target, copy);
12129     }
12130
12131   /* Insert the variable lanes directly.  */
12132   for (int i = 0; i < n_elts; i++)
12133     {
12134       rtx x = XVECEXP (vals, 0, i);
12135       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12136         continue;
12137       x = copy_to_mode_reg (inner_mode, x);
12138       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12139     }
12140 }
12141
12142 static unsigned HOST_WIDE_INT
12143 aarch64_shift_truncation_mask (machine_mode mode)
12144 {
12145   return
12146     (!SHIFT_COUNT_TRUNCATED
12147      || aarch64_vector_mode_supported_p (mode)
12148      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12149 }
12150
12151 /* Select a format to encode pointers in exception handling data.  */
12152 int
12153 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12154 {
12155    int type;
12156    switch (aarch64_cmodel)
12157      {
12158      case AARCH64_CMODEL_TINY:
12159      case AARCH64_CMODEL_TINY_PIC:
12160      case AARCH64_CMODEL_SMALL:
12161      case AARCH64_CMODEL_SMALL_PIC:
12162      case AARCH64_CMODEL_SMALL_SPIC:
12163        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12164           for everything.  */
12165        type = DW_EH_PE_sdata4;
12166        break;
12167      default:
12168        /* No assumptions here.  8-byte relocs required.  */
12169        type = DW_EH_PE_sdata8;
12170        break;
12171      }
12172    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12173 }
12174
12175 /* The last .arch and .tune assembly strings that we printed.  */
12176 static std::string aarch64_last_printed_arch_string;
12177 static std::string aarch64_last_printed_tune_string;
12178
12179 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12180    by the function fndecl.  */
12181
12182 void
12183 aarch64_declare_function_name (FILE *stream, const char* name,
12184                                 tree fndecl)
12185 {
12186   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12187
12188   struct cl_target_option *targ_options;
12189   if (target_parts)
12190     targ_options = TREE_TARGET_OPTION (target_parts);
12191   else
12192     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12193   gcc_assert (targ_options);
12194
12195   const struct processor *this_arch
12196     = aarch64_get_arch (targ_options->x_explicit_arch);
12197
12198   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12199   std::string extension
12200     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12201                                                   this_arch->flags);
12202   /* Only update the assembler .arch string if it is distinct from the last
12203      such string we printed.  */
12204   std::string to_print = this_arch->name + extension;
12205   if (to_print != aarch64_last_printed_arch_string)
12206     {
12207       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12208       aarch64_last_printed_arch_string = to_print;
12209     }
12210
12211   /* Print the cpu name we're tuning for in the comments, might be
12212      useful to readers of the generated asm.  Do it only when it changes
12213      from function to function and verbose assembly is requested.  */
12214   const struct processor *this_tune
12215     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12216
12217   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12218     {
12219       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12220                    this_tune->name);
12221       aarch64_last_printed_tune_string = this_tune->name;
12222     }
12223
12224   /* Don't forget the type directive for ELF.  */
12225   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12226   ASM_OUTPUT_LABEL (stream, name);
12227 }
12228
12229 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12230
12231 static void
12232 aarch64_start_file (void)
12233 {
12234   struct cl_target_option *default_options
12235     = TREE_TARGET_OPTION (target_option_default_node);
12236
12237   const struct processor *default_arch
12238     = aarch64_get_arch (default_options->x_explicit_arch);
12239   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12240   std::string extension
12241     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12242                                                   default_arch->flags);
12243
12244    aarch64_last_printed_arch_string = default_arch->name + extension;
12245    aarch64_last_printed_tune_string = "";
12246    asm_fprintf (asm_out_file, "\t.arch %s\n",
12247                 aarch64_last_printed_arch_string.c_str ());
12248
12249    default_file_start ();
12250 }
12251
12252 /* Emit load exclusive.  */
12253
12254 static void
12255 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12256                              rtx mem, rtx model_rtx)
12257 {
12258   rtx (*gen) (rtx, rtx, rtx);
12259
12260   switch (mode)
12261     {
12262     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12263     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12264     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12265     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12266     default:
12267       gcc_unreachable ();
12268     }
12269
12270   emit_insn (gen (rval, mem, model_rtx));
12271 }
12272
12273 /* Emit store exclusive.  */
12274
12275 static void
12276 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12277                               rtx rval, rtx mem, rtx model_rtx)
12278 {
12279   rtx (*gen) (rtx, rtx, rtx, rtx);
12280
12281   switch (mode)
12282     {
12283     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12284     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12285     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12286     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12287     default:
12288       gcc_unreachable ();
12289     }
12290
12291   emit_insn (gen (bval, rval, mem, model_rtx));
12292 }
12293
12294 /* Mark the previous jump instruction as unlikely.  */
12295
12296 static void
12297 aarch64_emit_unlikely_jump (rtx insn)
12298 {
12299   rtx_insn *jump = emit_jump_insn (insn);
12300   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12301 }
12302
12303 /* Expand a compare and swap pattern.  */
12304
12305 void
12306 aarch64_expand_compare_and_swap (rtx operands[])
12307 {
12308   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12309   machine_mode mode, cmp_mode;
12310   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12311   int idx;
12312   gen_cas_fn gen;
12313   const gen_cas_fn split_cas[] =
12314   {
12315     gen_aarch64_compare_and_swapqi,
12316     gen_aarch64_compare_and_swaphi,
12317     gen_aarch64_compare_and_swapsi,
12318     gen_aarch64_compare_and_swapdi
12319   };
12320   const gen_cas_fn atomic_cas[] =
12321   {
12322     gen_aarch64_compare_and_swapqi_lse,
12323     gen_aarch64_compare_and_swaphi_lse,
12324     gen_aarch64_compare_and_swapsi_lse,
12325     gen_aarch64_compare_and_swapdi_lse
12326   };
12327
12328   bval = operands[0];
12329   rval = operands[1];
12330   mem = operands[2];
12331   oldval = operands[3];
12332   newval = operands[4];
12333   is_weak = operands[5];
12334   mod_s = operands[6];
12335   mod_f = operands[7];
12336   mode = GET_MODE (mem);
12337   cmp_mode = mode;
12338
12339   /* Normally the succ memory model must be stronger than fail, but in the
12340      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12341      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12342
12343   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12344       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12345     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12346
12347   switch (mode)
12348     {
12349     case E_QImode:
12350     case E_HImode:
12351       /* For short modes, we're going to perform the comparison in SImode,
12352          so do the zero-extension now.  */
12353       cmp_mode = SImode;
12354       rval = gen_reg_rtx (SImode);
12355       oldval = convert_modes (SImode, mode, oldval, true);
12356       /* Fall through.  */
12357
12358     case E_SImode:
12359     case E_DImode:
12360       /* Force the value into a register if needed.  */
12361       if (!aarch64_plus_operand (oldval, mode))
12362         oldval = force_reg (cmp_mode, oldval);
12363       break;
12364
12365     default:
12366       gcc_unreachable ();
12367     }
12368
12369   switch (mode)
12370     {
12371     case E_QImode: idx = 0; break;
12372     case E_HImode: idx = 1; break;
12373     case E_SImode: idx = 2; break;
12374     case E_DImode: idx = 3; break;
12375     default:
12376       gcc_unreachable ();
12377     }
12378   if (TARGET_LSE)
12379     gen = atomic_cas[idx];
12380   else
12381     gen = split_cas[idx];
12382
12383   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12384
12385   if (mode == QImode || mode == HImode)
12386     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12387
12388   x = gen_rtx_REG (CCmode, CC_REGNUM);
12389   x = gen_rtx_EQ (SImode, x, const0_rtx);
12390   emit_insn (gen_rtx_SET (bval, x));
12391 }
12392
12393 /* Test whether the target supports using a atomic load-operate instruction.
12394    CODE is the operation and AFTER is TRUE if the data in memory after the
12395    operation should be returned and FALSE if the data before the operation
12396    should be returned.  Returns FALSE if the operation isn't supported by the
12397    architecture.  */
12398
12399 bool
12400 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12401 {
12402   if (!TARGET_LSE)
12403     return false;
12404
12405   switch (code)
12406     {
12407     case SET:
12408     case AND:
12409     case IOR:
12410     case XOR:
12411     case MINUS:
12412     case PLUS:
12413       return true;
12414     default:
12415       return false;
12416     }
12417 }
12418
12419 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12420    sequence implementing an atomic operation.  */
12421
12422 static void
12423 aarch64_emit_post_barrier (enum memmodel model)
12424 {
12425   const enum memmodel base_model = memmodel_base (model);
12426
12427   if (is_mm_sync (model)
12428       && (base_model == MEMMODEL_ACQUIRE
12429           || base_model == MEMMODEL_ACQ_REL
12430           || base_model == MEMMODEL_SEQ_CST))
12431     {
12432       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12433     }
12434 }
12435
12436 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12437    for the data in memory.  EXPECTED is the value expected to be in memory.
12438    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12439    is the memory ordering to use.  */
12440
12441 void
12442 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12443                         rtx expected, rtx desired,
12444                         rtx model)
12445 {
12446   rtx (*gen) (rtx, rtx, rtx, rtx);
12447   machine_mode mode;
12448
12449   mode = GET_MODE (mem);
12450
12451   switch (mode)
12452     {
12453     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12454     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12455     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12456     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12457     default:
12458       gcc_unreachable ();
12459     }
12460
12461   /* Move the expected value into the CAS destination register.  */
12462   emit_insn (gen_rtx_SET (rval, expected));
12463
12464   /* Emit the CAS.  */
12465   emit_insn (gen (rval, mem, desired, model));
12466
12467   /* Compare the expected value with the value loaded by the CAS, to establish
12468      whether the swap was made.  */
12469   aarch64_gen_compare_reg (EQ, rval, expected);
12470 }
12471
12472 /* Split a compare and swap pattern.  */
12473
12474 void
12475 aarch64_split_compare_and_swap (rtx operands[])
12476 {
12477   rtx rval, mem, oldval, newval, scratch;
12478   machine_mode mode;
12479   bool is_weak;
12480   rtx_code_label *label1, *label2;
12481   rtx x, cond;
12482   enum memmodel model;
12483   rtx model_rtx;
12484
12485   rval = operands[0];
12486   mem = operands[1];
12487   oldval = operands[2];
12488   newval = operands[3];
12489   is_weak = (operands[4] != const0_rtx);
12490   model_rtx = operands[5];
12491   scratch = operands[7];
12492   mode = GET_MODE (mem);
12493   model = memmodel_from_int (INTVAL (model_rtx));
12494
12495   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12496     loop:
12497     .label1:
12498         LD[A]XR rval, [mem]
12499         CBNZ    rval, .label2
12500         ST[L]XR scratch, newval, [mem]
12501         CBNZ    scratch, .label1
12502     .label2:
12503         CMP     rval, 0.  */
12504   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12505
12506   label1 = NULL;
12507   if (!is_weak)
12508     {
12509       label1 = gen_label_rtx ();
12510       emit_label (label1);
12511     }
12512   label2 = gen_label_rtx ();
12513
12514   /* The initial load can be relaxed for a __sync operation since a final
12515      barrier will be emitted to stop code hoisting.  */
12516   if (is_mm_sync (model))
12517     aarch64_emit_load_exclusive (mode, rval, mem,
12518                                  GEN_INT (MEMMODEL_RELAXED));
12519   else
12520     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12521
12522   if (strong_zero_p)
12523     {
12524       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12525       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12526                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12527       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12528     }
12529   else
12530     {
12531       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12532       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12533       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12534                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12535       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12536     }
12537
12538   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12539
12540   if (!is_weak)
12541     {
12542       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12543       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12544                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12545       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12546     }
12547   else
12548     {
12549       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12550       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12551       emit_insn (gen_rtx_SET (cond, x));
12552     }
12553
12554   emit_label (label2);
12555   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12556      to set the condition flags.  If this is not used it will be removed by
12557      later passes.  */
12558   if (strong_zero_p)
12559     {
12560       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12561       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12562       emit_insn (gen_rtx_SET (cond, x));
12563     }
12564   /* Emit any final barrier needed for a __sync operation.  */
12565   if (is_mm_sync (model))
12566     aarch64_emit_post_barrier (model);
12567 }
12568
12569 /* Emit a BIC instruction.  */
12570
12571 static void
12572 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12573 {
12574   rtx shift_rtx = GEN_INT (shift);
12575   rtx (*gen) (rtx, rtx, rtx, rtx);
12576
12577   switch (mode)
12578     {
12579     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12580     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12581     default:
12582       gcc_unreachable ();
12583     }
12584
12585   emit_insn (gen (dst, s2, shift_rtx, s1));
12586 }
12587
12588 /* Emit an atomic swap.  */
12589
12590 static void
12591 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12592                           rtx mem, rtx model)
12593 {
12594   rtx (*gen) (rtx, rtx, rtx, rtx);
12595
12596   switch (mode)
12597     {
12598     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12599     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12600     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12601     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12602     default:
12603       gcc_unreachable ();
12604     }
12605
12606   emit_insn (gen (dst, mem, value, model));
12607 }
12608
12609 /* Operations supported by aarch64_emit_atomic_load_op.  */
12610
12611 enum aarch64_atomic_load_op_code
12612 {
12613   AARCH64_LDOP_PLUS,    /* A + B  */
12614   AARCH64_LDOP_XOR,     /* A ^ B  */
12615   AARCH64_LDOP_OR,      /* A | B  */
12616   AARCH64_LDOP_BIC      /* A & ~B  */
12617 };
12618
12619 /* Emit an atomic load-operate.  */
12620
12621 static void
12622 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12623                              machine_mode mode, rtx dst, rtx src,
12624                              rtx mem, rtx model)
12625 {
12626   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12627   const aarch64_atomic_load_op_fn plus[] =
12628   {
12629     gen_aarch64_atomic_loadaddqi,
12630     gen_aarch64_atomic_loadaddhi,
12631     gen_aarch64_atomic_loadaddsi,
12632     gen_aarch64_atomic_loadadddi
12633   };
12634   const aarch64_atomic_load_op_fn eor[] =
12635   {
12636     gen_aarch64_atomic_loadeorqi,
12637     gen_aarch64_atomic_loadeorhi,
12638     gen_aarch64_atomic_loadeorsi,
12639     gen_aarch64_atomic_loadeordi
12640   };
12641   const aarch64_atomic_load_op_fn ior[] =
12642   {
12643     gen_aarch64_atomic_loadsetqi,
12644     gen_aarch64_atomic_loadsethi,
12645     gen_aarch64_atomic_loadsetsi,
12646     gen_aarch64_atomic_loadsetdi
12647   };
12648   const aarch64_atomic_load_op_fn bic[] =
12649   {
12650     gen_aarch64_atomic_loadclrqi,
12651     gen_aarch64_atomic_loadclrhi,
12652     gen_aarch64_atomic_loadclrsi,
12653     gen_aarch64_atomic_loadclrdi
12654   };
12655   aarch64_atomic_load_op_fn gen;
12656   int idx = 0;
12657
12658   switch (mode)
12659     {
12660     case E_QImode: idx = 0; break;
12661     case E_HImode: idx = 1; break;
12662     case E_SImode: idx = 2; break;
12663     case E_DImode: idx = 3; break;
12664     default:
12665       gcc_unreachable ();
12666     }
12667
12668   switch (code)
12669     {
12670     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12671     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12672     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12673     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12674     default:
12675       gcc_unreachable ();
12676     }
12677
12678   emit_insn (gen (dst, mem, src, model));
12679 }
12680
12681 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12682    location to store the data read from memory.  OUT_RESULT is the location to
12683    store the result of the operation.  MEM is the memory location to read and
12684    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12685    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12686    be NULL.  */
12687
12688 void
12689 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12690                          rtx mem, rtx value, rtx model_rtx)
12691 {
12692   machine_mode mode = GET_MODE (mem);
12693   machine_mode wmode = (mode == DImode ? DImode : SImode);
12694   const bool short_mode = (mode < SImode);
12695   aarch64_atomic_load_op_code ldop_code;
12696   rtx src;
12697   rtx x;
12698
12699   if (out_data)
12700     out_data = gen_lowpart (mode, out_data);
12701
12702   if (out_result)
12703     out_result = gen_lowpart (mode, out_result);
12704
12705   /* Make sure the value is in a register, putting it into a destination
12706      register if it needs to be manipulated.  */
12707   if (!register_operand (value, mode)
12708       || code == AND || code == MINUS)
12709     {
12710       src = out_result ? out_result : out_data;
12711       emit_move_insn (src, gen_lowpart (mode, value));
12712     }
12713   else
12714     src = value;
12715   gcc_assert (register_operand (src, mode));
12716
12717   /* Preprocess the data for the operation as necessary.  If the operation is
12718      a SET then emit a swap instruction and finish.  */
12719   switch (code)
12720     {
12721     case SET:
12722       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12723       return;
12724
12725     case MINUS:
12726       /* Negate the value and treat it as a PLUS.  */
12727       {
12728         rtx neg_src;
12729
12730         /* Resize the value if necessary.  */
12731         if (short_mode)
12732           src = gen_lowpart (wmode, src);
12733
12734         neg_src = gen_rtx_NEG (wmode, src);
12735         emit_insn (gen_rtx_SET (src, neg_src));
12736
12737         if (short_mode)
12738           src = gen_lowpart (mode, src);
12739       }
12740       /* Fall-through.  */
12741     case PLUS:
12742       ldop_code = AARCH64_LDOP_PLUS;
12743       break;
12744
12745     case IOR:
12746       ldop_code = AARCH64_LDOP_OR;
12747       break;
12748
12749     case XOR:
12750       ldop_code = AARCH64_LDOP_XOR;
12751       break;
12752
12753     case AND:
12754       {
12755         rtx not_src;
12756
12757         /* Resize the value if necessary.  */
12758         if (short_mode)
12759           src = gen_lowpart (wmode, src);
12760
12761         not_src = gen_rtx_NOT (wmode, src);
12762         emit_insn (gen_rtx_SET (src, not_src));
12763
12764         if (short_mode)
12765           src = gen_lowpart (mode, src);
12766       }
12767       ldop_code = AARCH64_LDOP_BIC;
12768       break;
12769
12770     default:
12771       /* The operation can't be done with atomic instructions.  */
12772       gcc_unreachable ();
12773     }
12774
12775   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12776
12777   /* If necessary, calculate the data in memory after the update by redoing the
12778      operation from values in registers.  */
12779   if (!out_result)
12780     return;
12781
12782   if (short_mode)
12783     {
12784       src = gen_lowpart (wmode, src);
12785       out_data = gen_lowpart (wmode, out_data);
12786       out_result = gen_lowpart (wmode, out_result);
12787     }
12788
12789   x = NULL_RTX;
12790
12791   switch (code)
12792     {
12793     case MINUS:
12794     case PLUS:
12795       x = gen_rtx_PLUS (wmode, out_data, src);
12796       break;
12797     case IOR:
12798       x = gen_rtx_IOR (wmode, out_data, src);
12799       break;
12800     case XOR:
12801       x = gen_rtx_XOR (wmode, out_data, src);
12802       break;
12803     case AND:
12804       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12805       return;
12806     default:
12807       gcc_unreachable ();
12808     }
12809
12810   emit_set_insn (out_result, x);
12811
12812   return;
12813 }
12814
12815 /* Split an atomic operation.  */
12816
12817 void
12818 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12819                          rtx value, rtx model_rtx, rtx cond)
12820 {
12821   machine_mode mode = GET_MODE (mem);
12822   machine_mode wmode = (mode == DImode ? DImode : SImode);
12823   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12824   const bool is_sync = is_mm_sync (model);
12825   rtx_code_label *label;
12826   rtx x;
12827
12828   /* Split the atomic operation into a sequence.  */
12829   label = gen_label_rtx ();
12830   emit_label (label);
12831
12832   if (new_out)
12833     new_out = gen_lowpart (wmode, new_out);
12834   if (old_out)
12835     old_out = gen_lowpart (wmode, old_out);
12836   else
12837     old_out = new_out;
12838   value = simplify_gen_subreg (wmode, value, mode, 0);
12839
12840   /* The initial load can be relaxed for a __sync operation since a final
12841      barrier will be emitted to stop code hoisting.  */
12842  if (is_sync)
12843     aarch64_emit_load_exclusive (mode, old_out, mem,
12844                                  GEN_INT (MEMMODEL_RELAXED));
12845   else
12846     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12847
12848   switch (code)
12849     {
12850     case SET:
12851       new_out = value;
12852       break;
12853
12854     case NOT:
12855       x = gen_rtx_AND (wmode, old_out, value);
12856       emit_insn (gen_rtx_SET (new_out, x));
12857       x = gen_rtx_NOT (wmode, new_out);
12858       emit_insn (gen_rtx_SET (new_out, x));
12859       break;
12860
12861     case MINUS:
12862       if (CONST_INT_P (value))
12863         {
12864           value = GEN_INT (-INTVAL (value));
12865           code = PLUS;
12866         }
12867       /* Fall through.  */
12868
12869     default:
12870       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12871       emit_insn (gen_rtx_SET (new_out, x));
12872       break;
12873     }
12874
12875   aarch64_emit_store_exclusive (mode, cond, mem,
12876                                 gen_lowpart (mode, new_out), model_rtx);
12877
12878   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12879   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12880                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12881   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12882
12883   /* Emit any final barrier needed for a __sync operation.  */
12884   if (is_sync)
12885     aarch64_emit_post_barrier (model);
12886 }
12887
12888 static void
12889 aarch64_init_libfuncs (void)
12890 {
12891    /* Half-precision float operations.  The compiler handles all operations
12892      with NULL libfuncs by converting to SFmode.  */
12893
12894   /* Conversions.  */
12895   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12896   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12897
12898   /* Arithmetic.  */
12899   set_optab_libfunc (add_optab, HFmode, NULL);
12900   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12901   set_optab_libfunc (smul_optab, HFmode, NULL);
12902   set_optab_libfunc (neg_optab, HFmode, NULL);
12903   set_optab_libfunc (sub_optab, HFmode, NULL);
12904
12905   /* Comparisons.  */
12906   set_optab_libfunc (eq_optab, HFmode, NULL);
12907   set_optab_libfunc (ne_optab, HFmode, NULL);
12908   set_optab_libfunc (lt_optab, HFmode, NULL);
12909   set_optab_libfunc (le_optab, HFmode, NULL);
12910   set_optab_libfunc (ge_optab, HFmode, NULL);
12911   set_optab_libfunc (gt_optab, HFmode, NULL);
12912   set_optab_libfunc (unord_optab, HFmode, NULL);
12913 }
12914
12915 /* Target hook for c_mode_for_suffix.  */
12916 static machine_mode
12917 aarch64_c_mode_for_suffix (char suffix)
12918 {
12919   if (suffix == 'q')
12920     return TFmode;
12921
12922   return VOIDmode;
12923 }
12924
12925 /* We can only represent floating point constants which will fit in
12926    "quarter-precision" values.  These values are characterised by
12927    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12928    by:
12929
12930    (-1)^s * (n/16) * 2^r
12931
12932    Where:
12933      's' is the sign bit.
12934      'n' is an integer in the range 16 <= n <= 31.
12935      'r' is an integer in the range -3 <= r <= 4.  */
12936
12937 /* Return true iff X can be represented by a quarter-precision
12938    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12939 bool
12940 aarch64_float_const_representable_p (rtx x)
12941 {
12942   /* This represents our current view of how many bits
12943      make up the mantissa.  */
12944   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12945   int exponent;
12946   unsigned HOST_WIDE_INT mantissa, mask;
12947   REAL_VALUE_TYPE r, m;
12948   bool fail;
12949
12950   if (!CONST_DOUBLE_P (x))
12951     return false;
12952
12953   /* We don't support HFmode constants yet.  */
12954   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12955     return false;
12956
12957   r = *CONST_DOUBLE_REAL_VALUE (x);
12958
12959   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12960      know if we have +zero until we analyse the mantissa, but we
12961      can reject the other invalid values.  */
12962   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12963       || REAL_VALUE_MINUS_ZERO (r))
12964     return false;
12965
12966   /* Extract exponent.  */
12967   r = real_value_abs (&r);
12968   exponent = REAL_EXP (&r);
12969
12970   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12971      highest (sign) bit, with a fixed binary point at bit point_pos.
12972      m1 holds the low part of the mantissa, m2 the high part.
12973      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12974      bits for the mantissa, this can fail (low bits will be lost).  */
12975   real_ldexp (&m, &r, point_pos - exponent);
12976   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12977
12978   /* If the low part of the mantissa has bits set we cannot represent
12979      the value.  */
12980   if (w.ulow () != 0)
12981     return false;
12982   /* We have rejected the lower HOST_WIDE_INT, so update our
12983      understanding of how many bits lie in the mantissa and
12984      look only at the high HOST_WIDE_INT.  */
12985   mantissa = w.elt (1);
12986   point_pos -= HOST_BITS_PER_WIDE_INT;
12987
12988   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12989   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12990   if ((mantissa & mask) != 0)
12991     return false;
12992
12993   /* Having filtered unrepresentable values, we may now remove all
12994      but the highest 5 bits.  */
12995   mantissa >>= point_pos - 5;
12996
12997   /* We cannot represent the value 0.0, so reject it.  This is handled
12998      elsewhere.  */
12999   if (mantissa == 0)
13000     return false;
13001
13002   /* Then, as bit 4 is always set, we can mask it off, leaving
13003      the mantissa in the range [0, 15].  */
13004   mantissa &= ~(1 << 4);
13005   gcc_assert (mantissa <= 15);
13006
13007   /* GCC internally does not use IEEE754-like encoding (where normalized
13008      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
13009      Our mantissa values are shifted 4 places to the left relative to
13010      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13011      by 5 places to correct for GCC's representation.  */
13012   exponent = 5 - exponent;
13013
13014   return (exponent >= 0 && exponent <= 7);
13015 }
13016
13017 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
13018    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
13019    output MOVI/MVNI, ORR or BIC immediate.  */
13020 char*
13021 aarch64_output_simd_mov_immediate (rtx const_vector,
13022                                    machine_mode mode,
13023                                    unsigned width,
13024                                    enum simd_immediate_check which)
13025 {
13026   bool is_valid;
13027   static char templ[40];
13028   const char *mnemonic;
13029   const char *shift_op;
13030   unsigned int lane_count = 0;
13031   char element_char;
13032
13033   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13034
13035   /* This will return true to show const_vector is legal for use as either
13036      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
13037      It will also update INFO to show how the immediate should be generated.
13038      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
13039   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false,
13040                                            &info, which);
13041   gcc_assert (is_valid);
13042
13043   element_char = sizetochar (info.element_width);
13044   lane_count = width / info.element_width;
13045
13046   mode = GET_MODE_INNER (mode);
13047   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13048     {
13049       gcc_assert (info.shift == 0 && ! info.mvn);
13050       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13051          move immediate path.  */
13052       if (aarch64_float_const_zero_rtx_p (info.value))
13053         info.value = GEN_INT (0);
13054       else
13055         {
13056           const unsigned int buf_size = 20;
13057           char float_buf[buf_size] = {'\0'};
13058           real_to_decimal_for_mode (float_buf,
13059                                     CONST_DOUBLE_REAL_VALUE (info.value),
13060                                     buf_size, buf_size, 1, mode);
13061
13062           if (lane_count == 1)
13063             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13064           else
13065             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13066                       lane_count, element_char, float_buf);
13067           return templ;
13068         }
13069     }
13070
13071   gcc_assert (CONST_INT_P (info.value));
13072
13073   if (which == AARCH64_CHECK_MOV)
13074     {
13075       mnemonic = info.mvn ? "mvni" : "movi";
13076       shift_op = info.msl ? "msl" : "lsl";
13077       if (lane_count == 1)
13078         snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13079                   mnemonic, UINTVAL (info.value));
13080       else if (info.shift)
13081         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13082                   HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
13083                   element_char, UINTVAL (info.value), shift_op, info.shift);
13084       else
13085         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
13086                   HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
13087                   element_char, UINTVAL (info.value));
13088     }
13089   else
13090     {
13091       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
13092       mnemonic = info.mvn ? "bic" : "orr";
13093       if (info.shift)
13094         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13095                   HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
13096                   element_char, UINTVAL (info.value), "lsl", info.shift);
13097       else
13098         snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
13099                   HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
13100                   element_char, UINTVAL (info.value));
13101     }
13102   return templ;
13103 }
13104
13105 char*
13106 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13107 {
13108
13109   /* If a floating point number was passed and we desire to use it in an
13110      integer mode do the conversion to integer.  */
13111   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13112     {
13113       unsigned HOST_WIDE_INT ival;
13114       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13115           gcc_unreachable ();
13116       immediate = gen_int_mode (ival, mode);
13117     }
13118
13119   machine_mode vmode;
13120   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13121      a 128 bit vector mode.  */
13122   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13123
13124   vmode = aarch64_simd_container_mode (mode, width);
13125   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13126   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13127 }
13128
13129 /* Split operands into moves from op[1] + op[2] into op[0].  */
13130
13131 void
13132 aarch64_split_combinev16qi (rtx operands[3])
13133 {
13134   unsigned int dest = REGNO (operands[0]);
13135   unsigned int src1 = REGNO (operands[1]);
13136   unsigned int src2 = REGNO (operands[2]);
13137   machine_mode halfmode = GET_MODE (operands[1]);
13138   unsigned int halfregs = REG_NREGS (operands[1]);
13139   rtx destlo, desthi;
13140
13141   gcc_assert (halfmode == V16QImode);
13142
13143   if (src1 == dest && src2 == dest + halfregs)
13144     {
13145       /* No-op move.  Can't split to nothing; emit something.  */
13146       emit_note (NOTE_INSN_DELETED);
13147       return;
13148     }
13149
13150   /* Preserve register attributes for variable tracking.  */
13151   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13152   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13153                                GET_MODE_SIZE (halfmode));
13154
13155   /* Special case of reversed high/low parts.  */
13156   if (reg_overlap_mentioned_p (operands[2], destlo)
13157       && reg_overlap_mentioned_p (operands[1], desthi))
13158     {
13159       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13160       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13161       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13162     }
13163   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13164     {
13165       /* Try to avoid unnecessary moves if part of the result
13166          is in the right place already.  */
13167       if (src1 != dest)
13168         emit_move_insn (destlo, operands[1]);
13169       if (src2 != dest + halfregs)
13170         emit_move_insn (desthi, operands[2]);
13171     }
13172   else
13173     {
13174       if (src2 != dest + halfregs)
13175         emit_move_insn (desthi, operands[2]);
13176       if (src1 != dest)
13177         emit_move_insn (destlo, operands[1]);
13178     }
13179 }
13180
13181 /* vec_perm support.  */
13182
13183 #define MAX_VECT_LEN 16
13184
13185 struct expand_vec_perm_d
13186 {
13187   rtx target, op0, op1;
13188   auto_vec_perm_indices perm;
13189   machine_mode vmode;
13190   bool one_vector_p;
13191   bool testing_p;
13192 };
13193
13194 /* Generate a variable permutation.  */
13195
13196 static void
13197 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13198 {
13199   machine_mode vmode = GET_MODE (target);
13200   bool one_vector_p = rtx_equal_p (op0, op1);
13201
13202   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13203   gcc_checking_assert (GET_MODE (op0) == vmode);
13204   gcc_checking_assert (GET_MODE (op1) == vmode);
13205   gcc_checking_assert (GET_MODE (sel) == vmode);
13206   gcc_checking_assert (TARGET_SIMD);
13207
13208   if (one_vector_p)
13209     {
13210       if (vmode == V8QImode)
13211         {
13212           /* Expand the argument to a V16QI mode by duplicating it.  */
13213           rtx pair = gen_reg_rtx (V16QImode);
13214           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13215           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13216         }
13217       else
13218         {
13219           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13220         }
13221     }
13222   else
13223     {
13224       rtx pair;
13225
13226       if (vmode == V8QImode)
13227         {
13228           pair = gen_reg_rtx (V16QImode);
13229           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13230           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13231         }
13232       else
13233         {
13234           pair = gen_reg_rtx (OImode);
13235           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13236           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13237         }
13238     }
13239 }
13240
13241 void
13242 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13243 {
13244   machine_mode vmode = GET_MODE (target);
13245   unsigned int nelt = GET_MODE_NUNITS (vmode);
13246   bool one_vector_p = rtx_equal_p (op0, op1);
13247   rtx mask;
13248
13249   /* The TBL instruction does not use a modulo index, so we must take care
13250      of that ourselves.  */
13251   mask = aarch64_simd_gen_const_vector_dup (vmode,
13252       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13253   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13254
13255   /* For big-endian, we also need to reverse the index within the vector
13256      (but not which vector).  */
13257   if (BYTES_BIG_ENDIAN)
13258     {
13259       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13260       if (!one_vector_p)
13261         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13262       sel = expand_simple_binop (vmode, XOR, sel, mask,
13263                                  NULL, 0, OPTAB_LIB_WIDEN);
13264     }
13265   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13266 }
13267
13268 /* Recognize patterns suitable for the TRN instructions.  */
13269 static bool
13270 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13271 {
13272   unsigned int i, odd, mask, nelt = d->perm.length ();
13273   rtx out, in0, in1, x;
13274   rtx (*gen) (rtx, rtx, rtx);
13275   machine_mode vmode = d->vmode;
13276
13277   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13278     return false;
13279
13280   /* Note that these are little-endian tests.
13281      We correct for big-endian later.  */
13282   if (d->perm[0] == 0)
13283     odd = 0;
13284   else if (d->perm[0] == 1)
13285     odd = 1;
13286   else
13287     return false;
13288   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13289
13290   for (i = 0; i < nelt; i += 2)
13291     {
13292       if (d->perm[i] != i + odd)
13293         return false;
13294       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13295         return false;
13296     }
13297
13298   /* Success!  */
13299   if (d->testing_p)
13300     return true;
13301
13302   in0 = d->op0;
13303   in1 = d->op1;
13304   if (BYTES_BIG_ENDIAN)
13305     {
13306       x = in0, in0 = in1, in1 = x;
13307       odd = !odd;
13308     }
13309   out = d->target;
13310
13311   if (odd)
13312     {
13313       switch (vmode)
13314         {
13315         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13316         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13317         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13318         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13319         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13320         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13321         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13322         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13323         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13324         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13325         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13326         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13327         default:
13328           return false;
13329         }
13330     }
13331   else
13332     {
13333       switch (vmode)
13334         {
13335         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13336         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13337         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13338         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13339         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13340         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13341         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13342         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13343         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13344         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13345         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13346         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13347         default:
13348           return false;
13349         }
13350     }
13351
13352   emit_insn (gen (out, in0, in1));
13353   return true;
13354 }
13355
13356 /* Recognize patterns suitable for the UZP instructions.  */
13357 static bool
13358 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13359 {
13360   unsigned int i, odd, mask, nelt = d->perm.length ();
13361   rtx out, in0, in1, x;
13362   rtx (*gen) (rtx, rtx, rtx);
13363   machine_mode vmode = d->vmode;
13364
13365   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13366     return false;
13367
13368   /* Note that these are little-endian tests.
13369      We correct for big-endian later.  */
13370   if (d->perm[0] == 0)
13371     odd = 0;
13372   else if (d->perm[0] == 1)
13373     odd = 1;
13374   else
13375     return false;
13376   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13377
13378   for (i = 0; i < nelt; i++)
13379     {
13380       unsigned elt = (i * 2 + odd) & mask;
13381       if (d->perm[i] != elt)
13382         return false;
13383     }
13384
13385   /* Success!  */
13386   if (d->testing_p)
13387     return true;
13388
13389   in0 = d->op0;
13390   in1 = d->op1;
13391   if (BYTES_BIG_ENDIAN)
13392     {
13393       x = in0, in0 = in1, in1 = x;
13394       odd = !odd;
13395     }
13396   out = d->target;
13397
13398   if (odd)
13399     {
13400       switch (vmode)
13401         {
13402         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13403         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13404         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13405         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13406         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13407         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13408         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13409         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13410         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13411         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13412         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13413         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13414         default:
13415           return false;
13416         }
13417     }
13418   else
13419     {
13420       switch (vmode)
13421         {
13422         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13423         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13424         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13425         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13426         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13427         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13428         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13429         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13430         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13431         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13432         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13433         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13434         default:
13435           return false;
13436         }
13437     }
13438
13439   emit_insn (gen (out, in0, in1));
13440   return true;
13441 }
13442
13443 /* Recognize patterns suitable for the ZIP instructions.  */
13444 static bool
13445 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13446 {
13447   unsigned int i, high, mask, nelt = d->perm.length ();
13448   rtx out, in0, in1, x;
13449   rtx (*gen) (rtx, rtx, rtx);
13450   machine_mode vmode = d->vmode;
13451
13452   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13453     return false;
13454
13455   /* Note that these are little-endian tests.
13456      We correct for big-endian later.  */
13457   high = nelt / 2;
13458   if (d->perm[0] == high)
13459     /* Do Nothing.  */
13460     ;
13461   else if (d->perm[0] == 0)
13462     high = 0;
13463   else
13464     return false;
13465   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13466
13467   for (i = 0; i < nelt / 2; i++)
13468     {
13469       unsigned elt = (i + high) & mask;
13470       if (d->perm[i * 2] != elt)
13471         return false;
13472       elt = (elt + nelt) & mask;
13473       if (d->perm[i * 2 + 1] != elt)
13474         return false;
13475     }
13476
13477   /* Success!  */
13478   if (d->testing_p)
13479     return true;
13480
13481   in0 = d->op0;
13482   in1 = d->op1;
13483   if (BYTES_BIG_ENDIAN)
13484     {
13485       x = in0, in0 = in1, in1 = x;
13486       high = !high;
13487     }
13488   out = d->target;
13489
13490   if (high)
13491     {
13492       switch (vmode)
13493         {
13494         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13495         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13496         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13497         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13498         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13499         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13500         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13501         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13502         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13503         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13504         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13505         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13506         default:
13507           return false;
13508         }
13509     }
13510   else
13511     {
13512       switch (vmode)
13513         {
13514         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13515         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13516         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13517         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13518         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13519         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13520         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13521         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13522         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13523         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13524         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13525         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13526         default:
13527           return false;
13528         }
13529     }
13530
13531   emit_insn (gen (out, in0, in1));
13532   return true;
13533 }
13534
13535 /* Recognize patterns for the EXT insn.  */
13536
13537 static bool
13538 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13539 {
13540   unsigned int i, nelt = d->perm.length ();
13541   rtx (*gen) (rtx, rtx, rtx, rtx);
13542   rtx offset;
13543
13544   unsigned int location = d->perm[0]; /* Always < nelt.  */
13545
13546   /* Check if the extracted indices are increasing by one.  */
13547   for (i = 1; i < nelt; i++)
13548     {
13549       unsigned int required = location + i;
13550       if (d->one_vector_p)
13551         {
13552           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13553           required &= (nelt - 1);
13554         }
13555       if (d->perm[i] != required)
13556         return false;
13557     }
13558
13559   switch (d->vmode)
13560     {
13561     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13562     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13563     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13564     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13565     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13566     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13567     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13568     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13569     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13570     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13571     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13572     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13573     default:
13574       return false;
13575     }
13576
13577   /* Success! */
13578   if (d->testing_p)
13579     return true;
13580
13581   /* The case where (location == 0) is a no-op for both big- and little-endian,
13582      and is removed by the mid-end at optimization levels -O1 and higher.  */
13583
13584   if (BYTES_BIG_ENDIAN && (location != 0))
13585     {
13586       /* After setup, we want the high elements of the first vector (stored
13587          at the LSB end of the register), and the low elements of the second
13588          vector (stored at the MSB end of the register). So swap.  */
13589       std::swap (d->op0, d->op1);
13590       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13591       location = nelt - location;
13592     }
13593
13594   offset = GEN_INT (location);
13595   emit_insn (gen (d->target, d->op0, d->op1, offset));
13596   return true;
13597 }
13598
13599 /* Recognize patterns for the REV insns.  */
13600
13601 static bool
13602 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13603 {
13604   unsigned int i, j, diff, nelt = d->perm.length ();
13605   rtx (*gen) (rtx, rtx);
13606
13607   if (!d->one_vector_p)
13608     return false;
13609
13610   diff = d->perm[0];
13611   switch (diff)
13612     {
13613     case 7:
13614       switch (d->vmode)
13615         {
13616         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13617         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13618         default:
13619           return false;
13620         }
13621       break;
13622     case 3:
13623       switch (d->vmode)
13624         {
13625         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13626         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13627         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13628         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13629         default:
13630           return false;
13631         }
13632       break;
13633     case 1:
13634       switch (d->vmode)
13635         {
13636         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13637         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13638         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13639         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13640         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13641         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13642         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13643         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13644         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13645         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13646         default:
13647           return false;
13648         }
13649       break;
13650     default:
13651       return false;
13652     }
13653
13654   for (i = 0; i < nelt ; i += diff + 1)
13655     for (j = 0; j <= diff; j += 1)
13656       {
13657         /* This is guaranteed to be true as the value of diff
13658            is 7, 3, 1 and we should have enough elements in the
13659            queue to generate this.  Getting a vector mask with a
13660            value of diff other than these values implies that
13661            something is wrong by the time we get here.  */
13662         gcc_assert (i + j < nelt);
13663         if (d->perm[i + j] != i + diff - j)
13664           return false;
13665       }
13666
13667   /* Success! */
13668   if (d->testing_p)
13669     return true;
13670
13671   emit_insn (gen (d->target, d->op0));
13672   return true;
13673 }
13674
13675 static bool
13676 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13677 {
13678   rtx (*gen) (rtx, rtx, rtx);
13679   rtx out = d->target;
13680   rtx in0;
13681   machine_mode vmode = d->vmode;
13682   unsigned int i, elt, nelt = d->perm.length ();
13683   rtx lane;
13684
13685   elt = d->perm[0];
13686   for (i = 1; i < nelt; i++)
13687     {
13688       if (elt != d->perm[i])
13689         return false;
13690     }
13691
13692   /* The generic preparation in aarch64_expand_vec_perm_const_1
13693      swaps the operand order and the permute indices if it finds
13694      d->perm[0] to be in the second operand.  Thus, we can always
13695      use d->op0 and need not do any extra arithmetic to get the
13696      correct lane number.  */
13697   in0 = d->op0;
13698   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13699
13700   switch (vmode)
13701     {
13702     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13703     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13704     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13705     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13706     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13707     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13708     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13709     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13710     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13711     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13712     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13713     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13714     default:
13715       return false;
13716     }
13717
13718   emit_insn (gen (out, in0, lane));
13719   return true;
13720 }
13721
13722 static bool
13723 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13724 {
13725   rtx rperm[MAX_VECT_LEN], sel;
13726   machine_mode vmode = d->vmode;
13727   unsigned int i, nelt = d->perm.length ();
13728
13729   if (d->testing_p)
13730     return true;
13731
13732   /* Generic code will try constant permutation twice.  Once with the
13733      original mode and again with the elements lowered to QImode.
13734      So wait and don't do the selector expansion ourselves.  */
13735   if (vmode != V8QImode && vmode != V16QImode)
13736     return false;
13737
13738   for (i = 0; i < nelt; ++i)
13739     {
13740       int nunits = GET_MODE_NUNITS (vmode);
13741
13742       /* If big-endian and two vectors we end up with a weird mixed-endian
13743          mode on NEON.  Reverse the index within each word but not the word
13744          itself.  */
13745       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13746                                            : d->perm[i]);
13747     }
13748   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13749   sel = force_reg (vmode, sel);
13750
13751   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13752   return true;
13753 }
13754
13755 static bool
13756 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13757 {
13758   /* The pattern matching functions above are written to look for a small
13759      number to begin the sequence (0, 1, N/2).  If we begin with an index
13760      from the second operand, we can swap the operands.  */
13761   unsigned int nelt = d->perm.length ();
13762   if (d->perm[0] >= nelt)
13763     {
13764       gcc_assert (nelt == (nelt & -nelt));
13765       for (unsigned int i = 0; i < nelt; ++i)
13766         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13767
13768       std::swap (d->op0, d->op1);
13769     }
13770
13771   if (TARGET_SIMD)
13772     {
13773       if (aarch64_evpc_rev (d))
13774         return true;
13775       else if (aarch64_evpc_ext (d))
13776         return true;
13777       else if (aarch64_evpc_dup (d))
13778         return true;
13779       else if (aarch64_evpc_zip (d))
13780         return true;
13781       else if (aarch64_evpc_uzp (d))
13782         return true;
13783       else if (aarch64_evpc_trn (d))
13784         return true;
13785       return aarch64_evpc_tbl (d);
13786     }
13787   return false;
13788 }
13789
13790 /* Expand a vec_perm_const pattern.  */
13791
13792 bool
13793 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13794 {
13795   struct expand_vec_perm_d d;
13796   int i, nelt, which;
13797
13798   d.target = target;
13799   d.op0 = op0;
13800   d.op1 = op1;
13801
13802   d.vmode = GET_MODE (target);
13803   gcc_assert (VECTOR_MODE_P (d.vmode));
13804   d.testing_p = false;
13805
13806   nelt = GET_MODE_NUNITS (d.vmode);
13807   d.perm.reserve (nelt);
13808   for (i = which = 0; i < nelt; ++i)
13809     {
13810       rtx e = XVECEXP (sel, 0, i);
13811       int ei = INTVAL (e) & (2 * nelt - 1);
13812       which |= (ei < nelt ? 1 : 2);
13813       d.perm.quick_push (ei);
13814     }
13815
13816   switch (which)
13817     {
13818     default:
13819       gcc_unreachable ();
13820
13821     case 3:
13822       d.one_vector_p = false;
13823       if (!rtx_equal_p (op0, op1))
13824         break;
13825
13826       /* The elements of PERM do not suggest that only the first operand
13827          is used, but both operands are identical.  Allow easier matching
13828          of the permutation by folding the permutation into the single
13829          input vector.  */
13830       /* Fall Through.  */
13831     case 2:
13832       for (i = 0; i < nelt; ++i)
13833         d.perm[i] &= nelt - 1;
13834       d.op0 = op1;
13835       d.one_vector_p = true;
13836       break;
13837
13838     case 1:
13839       d.op1 = op0;
13840       d.one_vector_p = true;
13841       break;
13842     }
13843
13844   return aarch64_expand_vec_perm_const_1 (&d);
13845 }
13846
13847 static bool
13848 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13849 {
13850   struct expand_vec_perm_d d;
13851   unsigned int i, nelt, which;
13852   bool ret;
13853
13854   d.vmode = vmode;
13855   d.testing_p = true;
13856   d.perm.safe_splice (sel);
13857
13858   /* Calculate whether all elements are in one vector.  */
13859   nelt = sel.length ();
13860   for (i = which = 0; i < nelt; ++i)
13861     {
13862       unsigned int e = d.perm[i];
13863       gcc_assert (e < 2 * nelt);
13864       which |= (e < nelt ? 1 : 2);
13865     }
13866
13867   /* If all elements are from the second vector, reindex as if from the
13868      first vector.  */
13869   if (which == 2)
13870     for (i = 0; i < nelt; ++i)
13871       d.perm[i] -= nelt;
13872
13873   /* Check whether the mask can be applied to a single vector.  */
13874   d.one_vector_p = (which != 3);
13875
13876   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13877   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13878   if (!d.one_vector_p)
13879     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13880
13881   start_sequence ();
13882   ret = aarch64_expand_vec_perm_const_1 (&d);
13883   end_sequence ();
13884
13885   return ret;
13886 }
13887
13888 rtx
13889 aarch64_reverse_mask (machine_mode mode)
13890 {
13891   /* We have to reverse each vector because we dont have
13892      a permuted load that can reverse-load according to ABI rules.  */
13893   rtx mask;
13894   rtvec v = rtvec_alloc (16);
13895   int i, j;
13896   int nunits = GET_MODE_NUNITS (mode);
13897   int usize = GET_MODE_UNIT_SIZE (mode);
13898
13899   gcc_assert (BYTES_BIG_ENDIAN);
13900   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13901
13902   for (i = 0; i < nunits; i++)
13903     for (j = 0; j < usize; j++)
13904       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13905   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13906   return force_reg (V16QImode, mask);
13907 }
13908
13909 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
13910    true.  However due to issues with register allocation it is preferable
13911    to avoid tieing integer scalar and FP scalar modes.  Executing integer
13912    operations in general registers is better than treating them as scalar
13913    vector operations.  This reduces latency and avoids redundant int<->FP
13914    moves.  So tie modes if they are either the same class, or vector modes
13915    with other vector modes, vector structs or any scalar mode.  */
13916
13917 static bool
13918 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13919 {
13920   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13921     return true;
13922
13923   /* We specifically want to allow elements of "structure" modes to
13924      be tieable to the structure.  This more general condition allows
13925      other rarer situations too.  */
13926   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13927     return true;
13928
13929   /* Also allow any scalar modes with vectors.  */
13930   if (aarch64_vector_mode_supported_p (mode1)
13931       || aarch64_vector_mode_supported_p (mode2))
13932     return true;
13933
13934   return false;
13935 }
13936
13937 /* Return a new RTX holding the result of moving POINTER forward by
13938    AMOUNT bytes.  */
13939
13940 static rtx
13941 aarch64_move_pointer (rtx pointer, int amount)
13942 {
13943   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13944
13945   return adjust_automodify_address (pointer, GET_MODE (pointer),
13946                                     next, amount);
13947 }
13948
13949 /* Return a new RTX holding the result of moving POINTER forward by the
13950    size of the mode it points to.  */
13951
13952 static rtx
13953 aarch64_progress_pointer (rtx pointer)
13954 {
13955   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13956
13957   return aarch64_move_pointer (pointer, amount);
13958 }
13959
13960 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13961    MODE bytes.  */
13962
13963 static void
13964 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13965                                               machine_mode mode)
13966 {
13967   rtx reg = gen_reg_rtx (mode);
13968
13969   /* "Cast" the pointers to the correct mode.  */
13970   *src = adjust_address (*src, mode, 0);
13971   *dst = adjust_address (*dst, mode, 0);
13972   /* Emit the memcpy.  */
13973   emit_move_insn (reg, *src);
13974   emit_move_insn (*dst, reg);
13975   /* Move the pointers forward.  */
13976   *src = aarch64_progress_pointer (*src);
13977   *dst = aarch64_progress_pointer (*dst);
13978 }
13979
13980 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13981    we succeed, otherwise return false.  */
13982
13983 bool
13984 aarch64_expand_movmem (rtx *operands)
13985 {
13986   unsigned int n;
13987   rtx dst = operands[0];
13988   rtx src = operands[1];
13989   rtx base;
13990   bool speed_p = !optimize_function_for_size_p (cfun);
13991
13992   /* When optimizing for size, give a better estimate of the length of a
13993      memcpy call, but use the default otherwise.  */
13994   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13995
13996   /* We can't do anything smart if the amount to copy is not constant.  */
13997   if (!CONST_INT_P (operands[2]))
13998     return false;
13999
14000   n = UINTVAL (operands[2]);
14001
14002   /* Try to keep the number of instructions low.  For cases below 16 bytes we
14003      need to make at most two moves.  For cases above 16 bytes it will be one
14004      move for each 16 byte chunk, then at most two additional moves.  */
14005   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
14006     return false;
14007
14008   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
14009   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
14010
14011   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
14012   src = adjust_automodify_address (src, VOIDmode, base, 0);
14013
14014   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
14015      1-byte chunk.  */
14016   if (n < 4)
14017     {
14018       if (n >= 2)
14019         {
14020           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14021           n -= 2;
14022         }
14023
14024       if (n == 1)
14025         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14026
14027       return true;
14028     }
14029
14030   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
14031      4-byte chunk, partially overlapping with the previously copied chunk.  */
14032   if (n < 8)
14033     {
14034       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14035       n -= 4;
14036       if (n > 0)
14037         {
14038           int move = n - 4;
14039
14040           src = aarch64_move_pointer (src, move);
14041           dst = aarch64_move_pointer (dst, move);
14042           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14043         }
14044       return true;
14045     }
14046
14047   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
14048      them, then (if applicable) an 8-byte chunk.  */
14049   while (n >= 8)
14050     {
14051       if (n / 16)
14052         {
14053           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14054           n -= 16;
14055         }
14056       else
14057         {
14058           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14059           n -= 8;
14060         }
14061     }
14062
14063   /* Finish the final bytes of the copy.  We can always do this in one
14064      instruction.  We either copy the exact amount we need, or partially
14065      overlap with the previous chunk we copied and copy 8-bytes.  */
14066   if (n == 0)
14067     return true;
14068   else if (n == 1)
14069     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14070   else if (n == 2)
14071     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14072   else if (n == 4)
14073     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14074   else
14075     {
14076       if (n == 3)
14077         {
14078           src = aarch64_move_pointer (src, -1);
14079           dst = aarch64_move_pointer (dst, -1);
14080           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14081         }
14082       else
14083         {
14084           int move = n - 8;
14085
14086           src = aarch64_move_pointer (src, move);
14087           dst = aarch64_move_pointer (dst, move);
14088           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14089         }
14090     }
14091
14092   return true;
14093 }
14094
14095 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14096    SImode stores.  Handle the case when the constant has identical
14097    bottom and top halves.  This is beneficial when the two stores can be
14098    merged into an STP and we avoid synthesising potentially expensive
14099    immediates twice.  Return true if such a split is possible.  */
14100
14101 bool
14102 aarch64_split_dimode_const_store (rtx dst, rtx src)
14103 {
14104   rtx lo = gen_lowpart (SImode, src);
14105   rtx hi = gen_highpart_mode (SImode, DImode, src);
14106
14107   bool size_p = optimize_function_for_size_p (cfun);
14108
14109   if (!rtx_equal_p (lo, hi))
14110     return false;
14111
14112   unsigned int orig_cost
14113     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14114   unsigned int lo_cost
14115     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14116
14117   /* We want to transform:
14118      MOV        x1, 49370
14119      MOVK       x1, 0x140, lsl 16
14120      MOVK       x1, 0xc0da, lsl 32
14121      MOVK       x1, 0x140, lsl 48
14122      STR        x1, [x0]
14123    into:
14124      MOV        w1, 49370
14125      MOVK       w1, 0x140, lsl 16
14126      STP        w1, w1, [x0]
14127    So we want to perform this only when we save two instructions
14128    or more.  When optimizing for size, however, accept any code size
14129    savings we can.  */
14130   if (size_p && orig_cost <= lo_cost)
14131     return false;
14132
14133   if (!size_p
14134       && (orig_cost <= lo_cost + 1))
14135     return false;
14136
14137   rtx mem_lo = adjust_address (dst, SImode, 0);
14138   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14139     return false;
14140
14141   rtx tmp_reg = gen_reg_rtx (SImode);
14142   aarch64_expand_mov_immediate (tmp_reg, lo);
14143   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14144   /* Don't emit an explicit store pair as this may not be always profitable.
14145      Let the sched-fusion logic decide whether to merge them.  */
14146   emit_move_insn (mem_lo, tmp_reg);
14147   emit_move_insn (mem_hi, tmp_reg);
14148
14149   return true;
14150 }
14151
14152 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14153
14154 static unsigned HOST_WIDE_INT
14155 aarch64_asan_shadow_offset (void)
14156 {
14157   return (HOST_WIDE_INT_1 << 36);
14158 }
14159
14160 static bool
14161 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14162                                         unsigned int align,
14163                                         enum by_pieces_operation op,
14164                                         bool speed_p)
14165 {
14166   /* STORE_BY_PIECES can be used when copying a constant string, but
14167      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14168      For now we always fail this and let the move_by_pieces code copy
14169      the string from read-only memory.  */
14170   if (op == STORE_BY_PIECES)
14171     return false;
14172
14173   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14174 }
14175
14176 static rtx
14177 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14178                         int code, tree treeop0, tree treeop1)
14179 {
14180   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14181   rtx op0, op1;
14182   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14183   insn_code icode;
14184   struct expand_operand ops[4];
14185
14186   start_sequence ();
14187   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14188
14189   op_mode = GET_MODE (op0);
14190   if (op_mode == VOIDmode)
14191     op_mode = GET_MODE (op1);
14192
14193   switch (op_mode)
14194     {
14195     case E_QImode:
14196     case E_HImode:
14197     case E_SImode:
14198       cmp_mode = SImode;
14199       icode = CODE_FOR_cmpsi;
14200       break;
14201
14202     case E_DImode:
14203       cmp_mode = DImode;
14204       icode = CODE_FOR_cmpdi;
14205       break;
14206
14207     case E_SFmode:
14208       cmp_mode = SFmode;
14209       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14210       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14211       break;
14212
14213     case E_DFmode:
14214       cmp_mode = DFmode;
14215       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14216       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14217       break;
14218
14219     default:
14220       end_sequence ();
14221       return NULL_RTX;
14222     }
14223
14224   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14225   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14226   if (!op0 || !op1)
14227     {
14228       end_sequence ();
14229       return NULL_RTX;
14230     }
14231   *prep_seq = get_insns ();
14232   end_sequence ();
14233
14234   create_fixed_operand (&ops[0], op0);
14235   create_fixed_operand (&ops[1], op1);
14236
14237   start_sequence ();
14238   if (!maybe_expand_insn (icode, 2, ops))
14239     {
14240       end_sequence ();
14241       return NULL_RTX;
14242     }
14243   *gen_seq = get_insns ();
14244   end_sequence ();
14245
14246   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14247                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14248 }
14249
14250 static rtx
14251 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14252                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14253 {
14254   rtx op0, op1, target;
14255   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14256   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14257   insn_code icode;
14258   struct expand_operand ops[6];
14259   int aarch64_cond;
14260
14261   push_to_sequence (*prep_seq);
14262   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14263
14264   op_mode = GET_MODE (op0);
14265   if (op_mode == VOIDmode)
14266     op_mode = GET_MODE (op1);
14267
14268   switch (op_mode)
14269     {
14270     case E_QImode:
14271     case E_HImode:
14272     case E_SImode:
14273       cmp_mode = SImode;
14274       icode = CODE_FOR_ccmpsi;
14275       break;
14276
14277     case E_DImode:
14278       cmp_mode = DImode;
14279       icode = CODE_FOR_ccmpdi;
14280       break;
14281
14282     case E_SFmode:
14283       cmp_mode = SFmode;
14284       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14285       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14286       break;
14287
14288     case E_DFmode:
14289       cmp_mode = DFmode;
14290       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14291       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14292       break;
14293
14294     default:
14295       end_sequence ();
14296       return NULL_RTX;
14297     }
14298
14299   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14300   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14301   if (!op0 || !op1)
14302     {
14303       end_sequence ();
14304       return NULL_RTX;
14305     }
14306   *prep_seq = get_insns ();
14307   end_sequence ();
14308
14309   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14310   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14311
14312   if (bit_code != AND)
14313     {
14314       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14315                                                 GET_MODE (XEXP (prev, 0))),
14316                              VOIDmode, XEXP (prev, 0), const0_rtx);
14317       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14318     }
14319
14320   create_fixed_operand (&ops[0], XEXP (prev, 0));
14321   create_fixed_operand (&ops[1], target);
14322   create_fixed_operand (&ops[2], op0);
14323   create_fixed_operand (&ops[3], op1);
14324   create_fixed_operand (&ops[4], prev);
14325   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14326
14327   push_to_sequence (*gen_seq);
14328   if (!maybe_expand_insn (icode, 6, ops))
14329     {
14330       end_sequence ();
14331       return NULL_RTX;
14332     }
14333
14334   *gen_seq = get_insns ();
14335   end_sequence ();
14336
14337   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14338 }
14339
14340 #undef TARGET_GEN_CCMP_FIRST
14341 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14342
14343 #undef TARGET_GEN_CCMP_NEXT
14344 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14345
14346 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14347    instruction fusion of some sort.  */
14348
14349 static bool
14350 aarch64_macro_fusion_p (void)
14351 {
14352   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14353 }
14354
14355
14356 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14357    should be kept together during scheduling.  */
14358
14359 static bool
14360 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14361 {
14362   rtx set_dest;
14363   rtx prev_set = single_set (prev);
14364   rtx curr_set = single_set (curr);
14365   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14366   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14367
14368   if (!aarch64_macro_fusion_p ())
14369     return false;
14370
14371   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14372     {
14373       /* We are trying to match:
14374          prev (mov)  == (set (reg r0) (const_int imm16))
14375          curr (movk) == (set (zero_extract (reg r0)
14376                                            (const_int 16)
14377                                            (const_int 16))
14378                              (const_int imm16_1))  */
14379
14380       set_dest = SET_DEST (curr_set);
14381
14382       if (GET_CODE (set_dest) == ZERO_EXTRACT
14383           && CONST_INT_P (SET_SRC (curr_set))
14384           && CONST_INT_P (SET_SRC (prev_set))
14385           && CONST_INT_P (XEXP (set_dest, 2))
14386           && INTVAL (XEXP (set_dest, 2)) == 16
14387           && REG_P (XEXP (set_dest, 0))
14388           && REG_P (SET_DEST (prev_set))
14389           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14390         {
14391           return true;
14392         }
14393     }
14394
14395   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14396     {
14397
14398       /*  We're trying to match:
14399           prev (adrp) == (set (reg r1)
14400                               (high (symbol_ref ("SYM"))))
14401           curr (add) == (set (reg r0)
14402                              (lo_sum (reg r1)
14403                                      (symbol_ref ("SYM"))))
14404           Note that r0 need not necessarily be the same as r1, especially
14405           during pre-regalloc scheduling.  */
14406
14407       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14408           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14409         {
14410           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14411               && REG_P (XEXP (SET_SRC (curr_set), 0))
14412               && REGNO (XEXP (SET_SRC (curr_set), 0))
14413                  == REGNO (SET_DEST (prev_set))
14414               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14415                               XEXP (SET_SRC (curr_set), 1)))
14416             return true;
14417         }
14418     }
14419
14420   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14421     {
14422
14423       /* We're trying to match:
14424          prev (movk) == (set (zero_extract (reg r0)
14425                                            (const_int 16)
14426                                            (const_int 32))
14427                              (const_int imm16_1))
14428          curr (movk) == (set (zero_extract (reg r0)
14429                                            (const_int 16)
14430                                            (const_int 48))
14431                              (const_int imm16_2))  */
14432
14433       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14434           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14435           && REG_P (XEXP (SET_DEST (prev_set), 0))
14436           && REG_P (XEXP (SET_DEST (curr_set), 0))
14437           && REGNO (XEXP (SET_DEST (prev_set), 0))
14438              == REGNO (XEXP (SET_DEST (curr_set), 0))
14439           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14440           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14441           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14442           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14443           && CONST_INT_P (SET_SRC (prev_set))
14444           && CONST_INT_P (SET_SRC (curr_set)))
14445         return true;
14446
14447     }
14448   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14449     {
14450       /* We're trying to match:
14451           prev (adrp) == (set (reg r0)
14452                               (high (symbol_ref ("SYM"))))
14453           curr (ldr) == (set (reg r1)
14454                              (mem (lo_sum (reg r0)
14455                                              (symbol_ref ("SYM")))))
14456                  or
14457           curr (ldr) == (set (reg r1)
14458                              (zero_extend (mem
14459                                            (lo_sum (reg r0)
14460                                                    (symbol_ref ("SYM"))))))  */
14461       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14462           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14463         {
14464           rtx curr_src = SET_SRC (curr_set);
14465
14466           if (GET_CODE (curr_src) == ZERO_EXTEND)
14467             curr_src = XEXP (curr_src, 0);
14468
14469           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14470               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14471               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14472                  == REGNO (SET_DEST (prev_set))
14473               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14474                               XEXP (SET_SRC (prev_set), 0)))
14475               return true;
14476         }
14477     }
14478
14479   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14480        && aarch_crypto_can_dual_issue (prev, curr))
14481     return true;
14482
14483   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14484       && any_condjump_p (curr))
14485     {
14486       enum attr_type prev_type = get_attr_type (prev);
14487
14488       unsigned int condreg1, condreg2;
14489       rtx cc_reg_1;
14490       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14491       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14492
14493       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14494           && prev
14495           && modified_in_p (cc_reg_1, prev))
14496         {
14497           /* FIXME: this misses some which is considered simple arthematic
14498              instructions for ThunderX.  Simple shifts are missed here.  */
14499           if (prev_type == TYPE_ALUS_SREG
14500               || prev_type == TYPE_ALUS_IMM
14501               || prev_type == TYPE_LOGICS_REG
14502               || prev_type == TYPE_LOGICS_IMM)
14503             return true;
14504         }
14505     }
14506
14507   if (prev_set
14508       && curr_set
14509       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14510       && any_condjump_p (curr))
14511     {
14512       /* We're trying to match:
14513           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14514           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14515                                                          (const_int 0))
14516                                                  (label_ref ("SYM"))
14517                                                  (pc))  */
14518       if (SET_DEST (curr_set) == (pc_rtx)
14519           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14520           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14521           && REG_P (SET_DEST (prev_set))
14522           && REGNO (SET_DEST (prev_set))
14523              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14524         {
14525           /* Fuse ALU operations followed by conditional branch instruction.  */
14526           switch (get_attr_type (prev))
14527             {
14528             case TYPE_ALU_IMM:
14529             case TYPE_ALU_SREG:
14530             case TYPE_ADC_REG:
14531             case TYPE_ADC_IMM:
14532             case TYPE_ADCS_REG:
14533             case TYPE_ADCS_IMM:
14534             case TYPE_LOGIC_REG:
14535             case TYPE_LOGIC_IMM:
14536             case TYPE_CSEL:
14537             case TYPE_ADR:
14538             case TYPE_MOV_IMM:
14539             case TYPE_SHIFT_REG:
14540             case TYPE_SHIFT_IMM:
14541             case TYPE_BFM:
14542             case TYPE_RBIT:
14543             case TYPE_REV:
14544             case TYPE_EXTEND:
14545               return true;
14546
14547             default:;
14548             }
14549         }
14550     }
14551
14552   return false;
14553 }
14554
14555 /* Return true iff the instruction fusion described by OP is enabled.  */
14556
14557 bool
14558 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14559 {
14560   return (aarch64_tune_params.fusible_ops & op) != 0;
14561 }
14562
14563 /* If MEM is in the form of [base+offset], extract the two parts
14564    of address and set to BASE and OFFSET, otherwise return false
14565    after clearing BASE and OFFSET.  */
14566
14567 bool
14568 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14569 {
14570   rtx addr;
14571
14572   gcc_assert (MEM_P (mem));
14573
14574   addr = XEXP (mem, 0);
14575
14576   if (REG_P (addr))
14577     {
14578       *base = addr;
14579       *offset = const0_rtx;
14580       return true;
14581     }
14582
14583   if (GET_CODE (addr) == PLUS
14584       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14585     {
14586       *base = XEXP (addr, 0);
14587       *offset = XEXP (addr, 1);
14588       return true;
14589     }
14590
14591   *base = NULL_RTX;
14592   *offset = NULL_RTX;
14593
14594   return false;
14595 }
14596
14597 /* Types for scheduling fusion.  */
14598 enum sched_fusion_type
14599 {
14600   SCHED_FUSION_NONE = 0,
14601   SCHED_FUSION_LD_SIGN_EXTEND,
14602   SCHED_FUSION_LD_ZERO_EXTEND,
14603   SCHED_FUSION_LD,
14604   SCHED_FUSION_ST,
14605   SCHED_FUSION_NUM
14606 };
14607
14608 /* If INSN is a load or store of address in the form of [base+offset],
14609    extract the two parts and set to BASE and OFFSET.  Return scheduling
14610    fusion type this INSN is.  */
14611
14612 static enum sched_fusion_type
14613 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14614 {
14615   rtx x, dest, src;
14616   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14617
14618   gcc_assert (INSN_P (insn));
14619   x = PATTERN (insn);
14620   if (GET_CODE (x) != SET)
14621     return SCHED_FUSION_NONE;
14622
14623   src = SET_SRC (x);
14624   dest = SET_DEST (x);
14625
14626   machine_mode dest_mode = GET_MODE (dest);
14627
14628   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14629     return SCHED_FUSION_NONE;
14630
14631   if (GET_CODE (src) == SIGN_EXTEND)
14632     {
14633       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14634       src = XEXP (src, 0);
14635       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14636         return SCHED_FUSION_NONE;
14637     }
14638   else if (GET_CODE (src) == ZERO_EXTEND)
14639     {
14640       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14641       src = XEXP (src, 0);
14642       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14643         return SCHED_FUSION_NONE;
14644     }
14645
14646   if (GET_CODE (src) == MEM && REG_P (dest))
14647     extract_base_offset_in_addr (src, base, offset);
14648   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14649     {
14650       fusion = SCHED_FUSION_ST;
14651       extract_base_offset_in_addr (dest, base, offset);
14652     }
14653   else
14654     return SCHED_FUSION_NONE;
14655
14656   if (*base == NULL_RTX || *offset == NULL_RTX)
14657     fusion = SCHED_FUSION_NONE;
14658
14659   return fusion;
14660 }
14661
14662 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14663
14664    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14665    and PRI are only calculated for these instructions.  For other instruction,
14666    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14667    type instruction fusion can be added by returning different priorities.
14668
14669    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14670
14671 static void
14672 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14673                                int *fusion_pri, int *pri)
14674 {
14675   int tmp, off_val;
14676   rtx base, offset;
14677   enum sched_fusion_type fusion;
14678
14679   gcc_assert (INSN_P (insn));
14680
14681   tmp = max_pri - 1;
14682   fusion = fusion_load_store (insn, &base, &offset);
14683   if (fusion == SCHED_FUSION_NONE)
14684     {
14685       *pri = tmp;
14686       *fusion_pri = tmp;
14687       return;
14688     }
14689
14690   /* Set FUSION_PRI according to fusion type and base register.  */
14691   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14692
14693   /* Calculate PRI.  */
14694   tmp /= 2;
14695
14696   /* INSN with smaller offset goes first.  */
14697   off_val = (int)(INTVAL (offset));
14698   if (off_val >= 0)
14699     tmp -= (off_val & 0xfffff);
14700   else
14701     tmp += ((- off_val) & 0xfffff);
14702
14703   *pri = tmp;
14704   return;
14705 }
14706
14707 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14708    Adjust priority of sha1h instructions so they are scheduled before
14709    other SHA1 instructions.  */
14710
14711 static int
14712 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14713 {
14714   rtx x = PATTERN (insn);
14715
14716   if (GET_CODE (x) == SET)
14717     {
14718       x = SET_SRC (x);
14719
14720       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14721         return priority + 10;
14722     }
14723
14724   return priority;
14725 }
14726
14727 /* Given OPERANDS of consecutive load/store, check if we can merge
14728    them into ldp/stp.  LOAD is true if they are load instructions.
14729    MODE is the mode of memory operands.  */
14730
14731 bool
14732 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14733                                 machine_mode mode)
14734 {
14735   HOST_WIDE_INT offval_1, offval_2, msize;
14736   enum reg_class rclass_1, rclass_2;
14737   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14738
14739   if (load)
14740     {
14741       mem_1 = operands[1];
14742       mem_2 = operands[3];
14743       reg_1 = operands[0];
14744       reg_2 = operands[2];
14745       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14746       if (REGNO (reg_1) == REGNO (reg_2))
14747         return false;
14748     }
14749   else
14750     {
14751       mem_1 = operands[0];
14752       mem_2 = operands[2];
14753       reg_1 = operands[1];
14754       reg_2 = operands[3];
14755     }
14756
14757   /* The mems cannot be volatile.  */
14758   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14759     return false;
14760
14761   /* If we have SImode and slow unaligned ldp,
14762      check the alignment to be at least 8 byte. */
14763   if (mode == SImode
14764       && (aarch64_tune_params.extra_tuning_flags
14765           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14766       && !optimize_size
14767       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14768     return false;
14769
14770   /* Check if the addresses are in the form of [base+offset].  */
14771   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14772   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14773     return false;
14774   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14775   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14776     return false;
14777
14778   /* Check if the bases are same.  */
14779   if (!rtx_equal_p (base_1, base_2))
14780     return false;
14781
14782   offval_1 = INTVAL (offset_1);
14783   offval_2 = INTVAL (offset_2);
14784   msize = GET_MODE_SIZE (mode);
14785   /* Check if the offsets are consecutive.  */
14786   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14787     return false;
14788
14789   /* Check if the addresses are clobbered by load.  */
14790   if (load)
14791     {
14792       if (reg_mentioned_p (reg_1, mem_1))
14793         return false;
14794
14795       /* In increasing order, the last load can clobber the address.  */
14796       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14797       return false;
14798     }
14799
14800   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14801     rclass_1 = FP_REGS;
14802   else
14803     rclass_1 = GENERAL_REGS;
14804
14805   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14806     rclass_2 = FP_REGS;
14807   else
14808     rclass_2 = GENERAL_REGS;
14809
14810   /* Check if the registers are of same class.  */
14811   if (rclass_1 != rclass_2)
14812     return false;
14813
14814   return true;
14815 }
14816
14817 /* Given OPERANDS of consecutive load/store, check if we can merge
14818    them into ldp/stp by adjusting the offset.  LOAD is true if they
14819    are load instructions.  MODE is the mode of memory operands.
14820
14821    Given below consecutive stores:
14822
14823      str  w1, [xb, 0x100]
14824      str  w1, [xb, 0x104]
14825      str  w1, [xb, 0x108]
14826      str  w1, [xb, 0x10c]
14827
14828    Though the offsets are out of the range supported by stp, we can
14829    still pair them after adjusting the offset, like:
14830
14831      add  scratch, xb, 0x100
14832      stp  w1, w1, [scratch]
14833      stp  w1, w1, [scratch, 0x8]
14834
14835    The peephole patterns detecting this opportunity should guarantee
14836    the scratch register is avaliable.  */
14837
14838 bool
14839 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14840                                        scalar_mode mode)
14841 {
14842   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14843   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14844   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14845   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14846
14847   if (load)
14848     {
14849       reg_1 = operands[0];
14850       mem_1 = operands[1];
14851       reg_2 = operands[2];
14852       mem_2 = operands[3];
14853       reg_3 = operands[4];
14854       mem_3 = operands[5];
14855       reg_4 = operands[6];
14856       mem_4 = operands[7];
14857       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14858                   && REG_P (reg_3) && REG_P (reg_4));
14859       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14860         return false;
14861     }
14862   else
14863     {
14864       mem_1 = operands[0];
14865       reg_1 = operands[1];
14866       mem_2 = operands[2];
14867       reg_2 = operands[3];
14868       mem_3 = operands[4];
14869       reg_3 = operands[5];
14870       mem_4 = operands[6];
14871       reg_4 = operands[7];
14872     }
14873   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14874   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14875     return false;
14876
14877   /* The mems cannot be volatile.  */
14878   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14879       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14880     return false;
14881
14882   /* Check if the addresses are in the form of [base+offset].  */
14883   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14884   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14885     return false;
14886   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14887   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14888     return false;
14889   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14890   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14891     return false;
14892   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14893   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14894     return false;
14895
14896   /* Check if the bases are same.  */
14897   if (!rtx_equal_p (base_1, base_2)
14898       || !rtx_equal_p (base_2, base_3)
14899       || !rtx_equal_p (base_3, base_4))
14900     return false;
14901
14902   offval_1 = INTVAL (offset_1);
14903   offval_2 = INTVAL (offset_2);
14904   offval_3 = INTVAL (offset_3);
14905   offval_4 = INTVAL (offset_4);
14906   msize = GET_MODE_SIZE (mode);
14907   /* Check if the offsets are consecutive.  */
14908   if ((offval_1 != (offval_2 + msize)
14909        || offval_1 != (offval_3 + msize * 2)
14910        || offval_1 != (offval_4 + msize * 3))
14911       && (offval_4 != (offval_3 + msize)
14912           || offval_4 != (offval_2 + msize * 2)
14913           || offval_4 != (offval_1 + msize * 3)))
14914     return false;
14915
14916   /* Check if the addresses are clobbered by load.  */
14917   if (load)
14918     {
14919       if (reg_mentioned_p (reg_1, mem_1)
14920           || reg_mentioned_p (reg_2, mem_2)
14921           || reg_mentioned_p (reg_3, mem_3))
14922         return false;
14923
14924       /* In increasing order, the last load can clobber the address.  */
14925       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14926         return false;
14927     }
14928
14929   /* If we have SImode and slow unaligned ldp,
14930      check the alignment to be at least 8 byte. */
14931   if (mode == SImode
14932       && (aarch64_tune_params.extra_tuning_flags
14933           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14934       && !optimize_size
14935       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14936     return false;
14937
14938   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14939     rclass_1 = FP_REGS;
14940   else
14941     rclass_1 = GENERAL_REGS;
14942
14943   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14944     rclass_2 = FP_REGS;
14945   else
14946     rclass_2 = GENERAL_REGS;
14947
14948   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14949     rclass_3 = FP_REGS;
14950   else
14951     rclass_3 = GENERAL_REGS;
14952
14953   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14954     rclass_4 = FP_REGS;
14955   else
14956     rclass_4 = GENERAL_REGS;
14957
14958   /* Check if the registers are of same class.  */
14959   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14960     return false;
14961
14962   return true;
14963 }
14964
14965 /* Given OPERANDS of consecutive load/store, this function pairs them
14966    into ldp/stp after adjusting the offset.  It depends on the fact
14967    that addresses of load/store instructions are in increasing order.
14968    MODE is the mode of memory operands.  CODE is the rtl operator
14969    which should be applied to all memory operands, it's SIGN_EXTEND,
14970    ZERO_EXTEND or UNKNOWN.  */
14971
14972 bool
14973 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14974                              scalar_mode mode, RTX_CODE code)
14975 {
14976   rtx base, offset, t1, t2;
14977   rtx mem_1, mem_2, mem_3, mem_4;
14978   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14979
14980   if (load)
14981     {
14982       mem_1 = operands[1];
14983       mem_2 = operands[3];
14984       mem_3 = operands[5];
14985       mem_4 = operands[7];
14986     }
14987   else
14988     {
14989       mem_1 = operands[0];
14990       mem_2 = operands[2];
14991       mem_3 = operands[4];
14992       mem_4 = operands[6];
14993       gcc_assert (code == UNKNOWN);
14994     }
14995
14996   extract_base_offset_in_addr (mem_1, &base, &offset);
14997   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14998
14999   /* Adjust offset thus it can fit in ldp/stp instruction.  */
15000   msize = GET_MODE_SIZE (mode);
15001   stp_off_limit = msize * 0x40;
15002   off_val = INTVAL (offset);
15003   abs_off = (off_val < 0) ? -off_val : off_val;
15004   new_off = abs_off % stp_off_limit;
15005   adj_off = abs_off - new_off;
15006
15007   /* Further adjust to make sure all offsets are OK.  */
15008   if ((new_off + msize * 2) >= stp_off_limit)
15009     {
15010       adj_off += stp_off_limit;
15011       new_off -= stp_off_limit;
15012     }
15013
15014   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
15015   if (adj_off >= 0x1000)
15016     return false;
15017
15018   if (off_val < 0)
15019     {
15020       adj_off = -adj_off;
15021       new_off = -new_off;
15022     }
15023
15024   /* Create new memory references.  */
15025   mem_1 = change_address (mem_1, VOIDmode,
15026                           plus_constant (DImode, operands[8], new_off));
15027
15028   /* Check if the adjusted address is OK for ldp/stp.  */
15029   if (!aarch64_mem_pair_operand (mem_1, mode))
15030     return false;
15031
15032   msize = GET_MODE_SIZE (mode);
15033   mem_2 = change_address (mem_2, VOIDmode,
15034                           plus_constant (DImode,
15035                                          operands[8],
15036                                          new_off + msize));
15037   mem_3 = change_address (mem_3, VOIDmode,
15038                           plus_constant (DImode,
15039                                          operands[8],
15040                                          new_off + msize * 2));
15041   mem_4 = change_address (mem_4, VOIDmode,
15042                           plus_constant (DImode,
15043                                          operands[8],
15044                                          new_off + msize * 3));
15045
15046   if (code == ZERO_EXTEND)
15047     {
15048       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15049       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15050       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15051       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15052     }
15053   else if (code == SIGN_EXTEND)
15054     {
15055       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15056       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15057       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15058       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15059     }
15060
15061   if (load)
15062     {
15063       operands[1] = mem_1;
15064       operands[3] = mem_2;
15065       operands[5] = mem_3;
15066       operands[7] = mem_4;
15067     }
15068   else
15069     {
15070       operands[0] = mem_1;
15071       operands[2] = mem_2;
15072       operands[4] = mem_3;
15073       operands[6] = mem_4;
15074     }
15075
15076   /* Emit adjusting instruction.  */
15077   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15078   /* Emit ldp/stp instructions.  */
15079   t1 = gen_rtx_SET (operands[0], operands[1]);
15080   t2 = gen_rtx_SET (operands[2], operands[3]);
15081   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15082   t1 = gen_rtx_SET (operands[4], operands[5]);
15083   t2 = gen_rtx_SET (operands[6], operands[7]);
15084   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15085   return true;
15086 }
15087
15088 /* Return 1 if pseudo register should be created and used to hold
15089    GOT address for PIC code.  */
15090
15091 bool
15092 aarch64_use_pseudo_pic_reg (void)
15093 {
15094   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15095 }
15096
15097 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15098
15099 static int
15100 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15101 {
15102   switch (XINT (x, 1))
15103     {
15104     case UNSPEC_GOTSMALLPIC:
15105     case UNSPEC_GOTSMALLPIC28K:
15106     case UNSPEC_GOTTINYPIC:
15107       return 0;
15108     default:
15109       break;
15110     }
15111
15112   return default_unspec_may_trap_p (x, flags);
15113 }
15114
15115
15116 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15117    return the log2 of that value.  Otherwise return -1.  */
15118
15119 int
15120 aarch64_fpconst_pow_of_2 (rtx x)
15121 {
15122   const REAL_VALUE_TYPE *r;
15123
15124   if (!CONST_DOUBLE_P (x))
15125     return -1;
15126
15127   r = CONST_DOUBLE_REAL_VALUE (x);
15128
15129   if (REAL_VALUE_NEGATIVE (*r)
15130       || REAL_VALUE_ISNAN (*r)
15131       || REAL_VALUE_ISINF (*r)
15132       || !real_isinteger (r, DFmode))
15133     return -1;
15134
15135   return exact_log2 (real_to_integer (r));
15136 }
15137
15138 /* If X is a vector of equal CONST_DOUBLE values and that value is
15139    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15140
15141 int
15142 aarch64_vec_fpconst_pow_of_2 (rtx x)
15143 {
15144   if (GET_CODE (x) != CONST_VECTOR)
15145     return -1;
15146
15147   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15148     return -1;
15149
15150   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15151   if (firstval <= 0)
15152     return -1;
15153
15154   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15155     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15156       return -1;
15157
15158   return firstval;
15159 }
15160
15161 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15162    to float.
15163
15164    __fp16 always promotes through this hook.
15165    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15166    through the generic excess precision logic rather than here.  */
15167
15168 static tree
15169 aarch64_promoted_type (const_tree t)
15170 {
15171   if (SCALAR_FLOAT_TYPE_P (t)
15172       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15173     return float_type_node;
15174
15175   return NULL_TREE;
15176 }
15177
15178 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15179
15180 static bool
15181 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15182                            optimization_type opt_type)
15183 {
15184   switch (op)
15185     {
15186     case rsqrt_optab:
15187       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15188
15189     default:
15190       return true;
15191     }
15192 }
15193
15194 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15195    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15196
15197 static bool
15198 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15199 {
15200   return (mode == HFmode
15201           ? true
15202           : default_libgcc_floating_mode_supported_p (mode));
15203 }
15204
15205 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15206    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15207
15208 static bool
15209 aarch64_scalar_mode_supported_p (scalar_mode mode)
15210 {
15211   return (mode == HFmode
15212           ? true
15213           : default_scalar_mode_supported_p (mode));
15214 }
15215
15216 /* Set the value of FLT_EVAL_METHOD.
15217    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15218
15219     0: evaluate all operations and constants, whose semantic type has at
15220        most the range and precision of type float, to the range and
15221        precision of float; evaluate all other operations and constants to
15222        the range and precision of the semantic type;
15223
15224     N, where _FloatN is a supported interchange floating type
15225        evaluate all operations and constants, whose semantic type has at
15226        most the range and precision of _FloatN type, to the range and
15227        precision of the _FloatN type; evaluate all other operations and
15228        constants to the range and precision of the semantic type;
15229
15230    If we have the ARMv8.2-A extensions then we support _Float16 in native
15231    precision, so we should set this to 16.  Otherwise, we support the type,
15232    but want to evaluate expressions in float precision, so set this to
15233    0.  */
15234
15235 static enum flt_eval_method
15236 aarch64_excess_precision (enum excess_precision_type type)
15237 {
15238   switch (type)
15239     {
15240       case EXCESS_PRECISION_TYPE_FAST:
15241       case EXCESS_PRECISION_TYPE_STANDARD:
15242         /* We can calculate either in 16-bit range and precision or
15243            32-bit range and precision.  Make that decision based on whether
15244            we have native support for the ARMv8.2-A 16-bit floating-point
15245            instructions or not.  */
15246         return (TARGET_FP_F16INST
15247                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15248                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15249       case EXCESS_PRECISION_TYPE_IMPLICIT:
15250         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15251       default:
15252         gcc_unreachable ();
15253     }
15254   return FLT_EVAL_METHOD_UNPREDICTABLE;
15255 }
15256
15257 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15258    scheduled for speculative execution.  Reject the long-running division
15259    and square-root instructions.  */
15260
15261 static bool
15262 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15263 {
15264   switch (get_attr_type (insn))
15265     {
15266       case TYPE_SDIV:
15267       case TYPE_UDIV:
15268       case TYPE_FDIVS:
15269       case TYPE_FDIVD:
15270       case TYPE_FSQRTS:
15271       case TYPE_FSQRTD:
15272       case TYPE_NEON_FP_SQRT_S:
15273       case TYPE_NEON_FP_SQRT_D:
15274       case TYPE_NEON_FP_SQRT_S_Q:
15275       case TYPE_NEON_FP_SQRT_D_Q:
15276       case TYPE_NEON_FP_DIV_S:
15277       case TYPE_NEON_FP_DIV_D:
15278       case TYPE_NEON_FP_DIV_S_Q:
15279       case TYPE_NEON_FP_DIV_D_Q:
15280         return false;
15281       default:
15282         return true;
15283     }
15284 }
15285
15286 /* Target-specific selftests.  */
15287
15288 #if CHECKING_P
15289
15290 namespace selftest {
15291
15292 /* Selftest for the RTL loader.
15293    Verify that the RTL loader copes with a dump from
15294    print_rtx_function.  This is essentially just a test that class
15295    function_reader can handle a real dump, but it also verifies
15296    that lookup_reg_by_dump_name correctly handles hard regs.
15297    The presence of hard reg names in the dump means that the test is
15298    target-specific, hence it is in this file.  */
15299
15300 static void
15301 aarch64_test_loading_full_dump ()
15302 {
15303   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15304
15305   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15306
15307   rtx_insn *insn_1 = get_insn_by_uid (1);
15308   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15309
15310   rtx_insn *insn_15 = get_insn_by_uid (15);
15311   ASSERT_EQ (INSN, GET_CODE (insn_15));
15312   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15313
15314   /* Verify crtl->return_rtx.  */
15315   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15316   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15317   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15318 }
15319
15320 /* Run all target-specific selftests.  */
15321
15322 static void
15323 aarch64_run_selftests (void)
15324 {
15325   aarch64_test_loading_full_dump ();
15326 }
15327
15328 } // namespace selftest
15329
15330 #endif /* #if CHECKING_P */
15331
15332 #undef TARGET_ADDRESS_COST
15333 #define TARGET_ADDRESS_COST aarch64_address_cost
15334
15335 /* This hook will determines whether unnamed bitfields affect the alignment
15336    of the containing structure.  The hook returns true if the structure
15337    should inherit the alignment requirements of an unnamed bitfield's
15338    type.  */
15339 #undef TARGET_ALIGN_ANON_BITFIELD
15340 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15341
15342 #undef TARGET_ASM_ALIGNED_DI_OP
15343 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15344
15345 #undef TARGET_ASM_ALIGNED_HI_OP
15346 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15347
15348 #undef TARGET_ASM_ALIGNED_SI_OP
15349 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15350
15351 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15352 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15353   hook_bool_const_tree_hwi_hwi_const_tree_true
15354
15355 #undef TARGET_ASM_FILE_START
15356 #define TARGET_ASM_FILE_START aarch64_start_file
15357
15358 #undef TARGET_ASM_OUTPUT_MI_THUNK
15359 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15360
15361 #undef TARGET_ASM_SELECT_RTX_SECTION
15362 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15363
15364 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15365 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15366
15367 #undef TARGET_BUILD_BUILTIN_VA_LIST
15368 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15369
15370 #undef TARGET_CALLEE_COPIES
15371 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15372
15373 #undef TARGET_CAN_ELIMINATE
15374 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15375
15376 #undef TARGET_CAN_INLINE_P
15377 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15378
15379 #undef TARGET_CANNOT_FORCE_CONST_MEM
15380 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15381
15382 #undef TARGET_CASE_VALUES_THRESHOLD
15383 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15384
15385 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15386 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15387
15388 /* Only the least significant bit is used for initialization guard
15389    variables.  */
15390 #undef TARGET_CXX_GUARD_MASK_BIT
15391 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15392
15393 #undef TARGET_C_MODE_FOR_SUFFIX
15394 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15395
15396 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15397 #undef  TARGET_DEFAULT_TARGET_FLAGS
15398 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15399 #endif
15400
15401 #undef TARGET_CLASS_MAX_NREGS
15402 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15403
15404 #undef TARGET_BUILTIN_DECL
15405 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15406
15407 #undef TARGET_BUILTIN_RECIPROCAL
15408 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15409
15410 #undef TARGET_C_EXCESS_PRECISION
15411 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15412
15413 #undef  TARGET_EXPAND_BUILTIN
15414 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15415
15416 #undef TARGET_EXPAND_BUILTIN_VA_START
15417 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15418
15419 #undef TARGET_FOLD_BUILTIN
15420 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15421
15422 #undef TARGET_FUNCTION_ARG
15423 #define TARGET_FUNCTION_ARG aarch64_function_arg
15424
15425 #undef TARGET_FUNCTION_ARG_ADVANCE
15426 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15427
15428 #undef TARGET_FUNCTION_ARG_BOUNDARY
15429 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15430
15431 #undef TARGET_FUNCTION_ARG_PADDING
15432 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15433
15434 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15435 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15436
15437 #undef TARGET_FUNCTION_VALUE
15438 #define TARGET_FUNCTION_VALUE aarch64_function_value
15439
15440 #undef TARGET_FUNCTION_VALUE_REGNO_P
15441 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15442
15443 #undef TARGET_FRAME_POINTER_REQUIRED
15444 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15445
15446 #undef TARGET_GIMPLE_FOLD_BUILTIN
15447 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15448
15449 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15450 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15451
15452 #undef  TARGET_INIT_BUILTINS
15453 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15454
15455 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15456 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15457   aarch64_ira_change_pseudo_allocno_class
15458
15459 #undef TARGET_LEGITIMATE_ADDRESS_P
15460 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15461
15462 #undef TARGET_LEGITIMATE_CONSTANT_P
15463 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15464
15465 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15466 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15467   aarch64_legitimize_address_displacement
15468
15469 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15470 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15471
15472 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15473 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15474 aarch64_libgcc_floating_mode_supported_p
15475
15476 #undef TARGET_MANGLE_TYPE
15477 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15478
15479 #undef TARGET_MEMORY_MOVE_COST
15480 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15481
15482 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15483 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15484
15485 #undef TARGET_MUST_PASS_IN_STACK
15486 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15487
15488 /* This target hook should return true if accesses to volatile bitfields
15489    should use the narrowest mode possible.  It should return false if these
15490    accesses should use the bitfield container type.  */
15491 #undef TARGET_NARROW_VOLATILE_BITFIELD
15492 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15493
15494 #undef  TARGET_OPTION_OVERRIDE
15495 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15496
15497 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15498 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15499   aarch64_override_options_after_change
15500
15501 #undef TARGET_OPTION_SAVE
15502 #define TARGET_OPTION_SAVE aarch64_option_save
15503
15504 #undef TARGET_OPTION_RESTORE
15505 #define TARGET_OPTION_RESTORE aarch64_option_restore
15506
15507 #undef TARGET_OPTION_PRINT
15508 #define TARGET_OPTION_PRINT aarch64_option_print
15509
15510 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15511 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15512
15513 #undef TARGET_SET_CURRENT_FUNCTION
15514 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15515
15516 #undef TARGET_PASS_BY_REFERENCE
15517 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15518
15519 #undef TARGET_PREFERRED_RELOAD_CLASS
15520 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15521
15522 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15523 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15524
15525 #undef TARGET_PROMOTED_TYPE
15526 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15527
15528 #undef TARGET_SECONDARY_RELOAD
15529 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15530
15531 #undef TARGET_SHIFT_TRUNCATION_MASK
15532 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15533
15534 #undef TARGET_SETUP_INCOMING_VARARGS
15535 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15536
15537 #undef TARGET_STRUCT_VALUE_RTX
15538 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15539
15540 #undef TARGET_REGISTER_MOVE_COST
15541 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15542
15543 #undef TARGET_RETURN_IN_MEMORY
15544 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15545
15546 #undef TARGET_RETURN_IN_MSB
15547 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15548
15549 #undef TARGET_RTX_COSTS
15550 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15551
15552 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15553 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15554
15555 #undef TARGET_SCHED_ISSUE_RATE
15556 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15557
15558 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15559 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15560   aarch64_sched_first_cycle_multipass_dfa_lookahead
15561
15562 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15563 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15564   aarch64_first_cycle_multipass_dfa_lookahead_guard
15565
15566 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15567 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15568   aarch64_get_separate_components
15569
15570 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15571 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15572   aarch64_components_for_bb
15573
15574 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15575 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15576   aarch64_disqualify_components
15577
15578 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15579 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15580   aarch64_emit_prologue_components
15581
15582 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15583 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15584   aarch64_emit_epilogue_components
15585
15586 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15587 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15588   aarch64_set_handled_components
15589
15590 #undef TARGET_TRAMPOLINE_INIT
15591 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15592
15593 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15594 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15595
15596 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15597 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15598
15599 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15600 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15601   aarch64_builtin_support_vector_misalignment
15602
15603 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15604 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15605
15606 #undef TARGET_VECTORIZE_ADD_STMT_COST
15607 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15608
15609 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15610 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15611   aarch64_builtin_vectorization_cost
15612
15613 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15614 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15615
15616 #undef TARGET_VECTORIZE_BUILTINS
15617 #define TARGET_VECTORIZE_BUILTINS
15618
15619 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15620 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15621   aarch64_builtin_vectorized_function
15622
15623 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15624 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15625   aarch64_autovectorize_vector_sizes
15626
15627 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15628 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15629   aarch64_atomic_assign_expand_fenv
15630
15631 /* Section anchor support.  */
15632
15633 #undef TARGET_MIN_ANCHOR_OFFSET
15634 #define TARGET_MIN_ANCHOR_OFFSET -256
15635
15636 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15637    byte offset; we can do much more for larger data types, but have no way
15638    to determine the size of the access.  We assume accesses are aligned.  */
15639 #undef TARGET_MAX_ANCHOR_OFFSET
15640 #define TARGET_MAX_ANCHOR_OFFSET 4095
15641
15642 #undef TARGET_VECTOR_ALIGNMENT
15643 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15644
15645 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15646 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15647   aarch64_simd_vector_alignment_reachable
15648
15649 /* vec_perm support.  */
15650
15651 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15652 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15653   aarch64_vectorize_vec_perm_const_ok
15654
15655 #undef TARGET_INIT_LIBFUNCS
15656 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15657
15658 #undef TARGET_FIXED_CONDITION_CODE_REGS
15659 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15660
15661 #undef TARGET_FLAGS_REGNUM
15662 #define TARGET_FLAGS_REGNUM CC_REGNUM
15663
15664 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15665 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15666
15667 #undef TARGET_ASAN_SHADOW_OFFSET
15668 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15669
15670 #undef TARGET_LEGITIMIZE_ADDRESS
15671 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15672
15673 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15674 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15675   aarch64_use_by_pieces_infrastructure_p
15676
15677 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15678 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15679
15680 #undef TARGET_CAN_USE_DOLOOP_P
15681 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15682
15683 #undef TARGET_SCHED_ADJUST_PRIORITY
15684 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15685
15686 #undef TARGET_SCHED_MACRO_FUSION_P
15687 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15688
15689 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15690 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15691
15692 #undef TARGET_SCHED_FUSION_PRIORITY
15693 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15694
15695 #undef TARGET_UNSPEC_MAY_TRAP_P
15696 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15697
15698 #undef TARGET_USE_PSEUDO_PIC_REG
15699 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15700
15701 #undef TARGET_PRINT_OPERAND
15702 #define TARGET_PRINT_OPERAND aarch64_print_operand
15703
15704 #undef TARGET_PRINT_OPERAND_ADDRESS
15705 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15706
15707 #undef TARGET_OPTAB_SUPPORTED_P
15708 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15709
15710 #undef TARGET_OMIT_STRUCT_RETURN_REG
15711 #define TARGET_OMIT_STRUCT_RETURN_REG true
15712
15713 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15714 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15715 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15716
15717 #undef TARGET_HARD_REGNO_NREGS
15718 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15719 #undef TARGET_HARD_REGNO_MODE_OK
15720 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15721
15722 #undef TARGET_MODES_TIEABLE_P
15723 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15724
15725 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15726 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15727   aarch64_hard_regno_call_part_clobbered
15728
15729 #undef TARGET_CONSTANT_ALIGNMENT
15730 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15731
15732 #if CHECKING_P
15733 #undef TARGET_RUN_TARGET_SELFTESTS
15734 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15735 #endif /* #if CHECKING_P */
15736
15737 struct gcc_target targetm = TARGET_INITIALIZER;
15738
15739 #include "gt-aarch64.h"