gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 145                                                  const unsigned char *sel);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement TARGET_HARD_REGNO_NREGS.  */
1071
1072 static unsigned int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1087
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return true;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110       else
1111         return true;
1112     }
1113
1114   return false;
1115 }
1116
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1118    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1119    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1120
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1123 {
1124   return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1125 }
1126
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130                                      machine_mode mode)
1131 {
1132   /* Handle modes that fit within single registers.  */
1133   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1134     {
1135       if (GET_MODE_SIZE (mode) >= 4)
1136         return mode;
1137       else
1138         return SImode;
1139     }
1140   /* Fall back to generic for multi-reg and very large modes.  */
1141   else
1142     return choose_hard_reg_mode (regno, nregs, false);
1143 }
1144
1145 /* Return true if calls to DECL should be treated as
1146    long-calls (ie called via a register).  */
1147 static bool
1148 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1149 {
1150   return false;
1151 }
1152
1153 /* Return true if calls to symbol-ref SYM should be treated as
1154    long-calls (ie called via a register).  */
1155 bool
1156 aarch64_is_long_call_p (rtx sym)
1157 {
1158   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1159 }
1160
1161 /* Return true if calls to symbol-ref SYM should not go through
1162    plt stubs.  */
1163
1164 bool
1165 aarch64_is_noplt_call_p (rtx sym)
1166 {
1167   const_tree decl = SYMBOL_REF_DECL (sym);
1168
1169   if (flag_pic
1170       && decl
1171       && (!flag_plt
1172           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1173       && !targetm.binds_local_p (decl))
1174     return true;
1175
1176   return false;
1177 }
1178
1179 /* Return true if the offsets to a zero/sign-extract operation
1180    represent an expression that matches an extend operation.  The
1181    operands represent the paramters from
1182
1183    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1184 bool
1185 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1186                                 rtx extract_imm)
1187 {
1188   HOST_WIDE_INT mult_val, extract_val;
1189
1190   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1191     return false;
1192
1193   mult_val = INTVAL (mult_imm);
1194   extract_val = INTVAL (extract_imm);
1195
1196   if (extract_val > 8
1197       && extract_val < GET_MODE_BITSIZE (mode)
1198       && exact_log2 (extract_val & ~7) > 0
1199       && (extract_val & 7) <= 4
1200       && mult_val == (1 << (extract_val & 7)))
1201     return true;
1202
1203   return false;
1204 }
1205
1206 /* Emit an insn that's a simple single-set.  Both the operands must be
1207    known to be valid.  */
1208 inline static rtx_insn *
1209 emit_set_insn (rtx x, rtx y)
1210 {
1211   return emit_insn (gen_rtx_SET (x, y));
1212 }
1213
1214 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1215    return the rtx for register 0 in the proper mode.  */
1216 rtx
1217 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1218 {
1219   machine_mode mode = SELECT_CC_MODE (code, x, y);
1220   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1221
1222   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1223   return cc_reg;
1224 }
1225
1226 /* Build the SYMBOL_REF for __tls_get_addr.  */
1227
1228 static GTY(()) rtx tls_get_addr_libfunc;
1229
1230 rtx
1231 aarch64_tls_get_addr (void)
1232 {
1233   if (!tls_get_addr_libfunc)
1234     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1235   return tls_get_addr_libfunc;
1236 }
1237
1238 /* Return the TLS model to use for ADDR.  */
1239
1240 static enum tls_model
1241 tls_symbolic_operand_type (rtx addr)
1242 {
1243   enum tls_model tls_kind = TLS_MODEL_NONE;
1244   rtx sym, addend;
1245
1246   if (GET_CODE (addr) == CONST)
1247     {
1248       split_const (addr, &sym, &addend);
1249       if (GET_CODE (sym) == SYMBOL_REF)
1250         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1251     }
1252   else if (GET_CODE (addr) == SYMBOL_REF)
1253     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1254
1255   return tls_kind;
1256 }
1257
1258 /* We'll allow lo_sum's in addresses in our legitimate addresses
1259    so that combine would take care of combining addresses where
1260    necessary, but for generation purposes, we'll generate the address
1261    as :
1262    RTL                               Absolute
1263    tmp = hi (symbol_ref);            adrp  x1, foo
1264    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1265                                      nop
1266
1267    PIC                               TLS
1268    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1269    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1270                                      bl   __tls_get_addr
1271                                      nop
1272
1273    Load TLS symbol, depending on TLS mechanism and TLS access model.
1274
1275    Global Dynamic - Traditional TLS:
1276    adrp tmp, :tlsgd:imm
1277    add  dest, tmp, #:tlsgd_lo12:imm
1278    bl   __tls_get_addr
1279
1280    Global Dynamic - TLS Descriptors:
1281    adrp dest, :tlsdesc:imm
1282    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1283    add  dest, dest, #:tlsdesc_lo12:imm
1284    blr  tmp
1285    mrs  tp, tpidr_el0
1286    add  dest, dest, tp
1287
1288    Initial Exec:
1289    mrs  tp, tpidr_el0
1290    adrp tmp, :gottprel:imm
1291    ldr  dest, [tmp, #:gottprel_lo12:imm]
1292    add  dest, dest, tp
1293
1294    Local Exec:
1295    mrs  tp, tpidr_el0
1296    add  t0, tp, #:tprel_hi12:imm, lsl #12
1297    add  t0, t0, #:tprel_lo12_nc:imm
1298 */
1299
1300 static void
1301 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1302                                    enum aarch64_symbol_type type)
1303 {
1304   switch (type)
1305     {
1306     case SYMBOL_SMALL_ABSOLUTE:
1307       {
1308         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1309         rtx tmp_reg = dest;
1310         machine_mode mode = GET_MODE (dest);
1311
1312         gcc_assert (mode == Pmode || mode == ptr_mode);
1313
1314         if (can_create_pseudo_p ())
1315           tmp_reg = gen_reg_rtx (mode);
1316
1317         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1318         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1319         return;
1320       }
1321
1322     case SYMBOL_TINY_ABSOLUTE:
1323       emit_insn (gen_rtx_SET (dest, imm));
1324       return;
1325
1326     case SYMBOL_SMALL_GOT_28K:
1327       {
1328         machine_mode mode = GET_MODE (dest);
1329         rtx gp_rtx = pic_offset_table_rtx;
1330         rtx insn;
1331         rtx mem;
1332
1333         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1334            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1335            decide rtx costs, in which case pic_offset_table_rtx is not
1336            initialized.  For that case no need to generate the first adrp
1337            instruction as the final cost for global variable access is
1338            one instruction.  */
1339         if (gp_rtx != NULL)
1340           {
1341             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1342                using the page base as GOT base, the first page may be wasted,
1343                in the worst scenario, there is only 28K space for GOT).
1344
1345                The generate instruction sequence for accessing global variable
1346                is:
1347
1348                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1349
1350                Only one instruction needed. But we must initialize
1351                pic_offset_table_rtx properly.  We generate initialize insn for
1352                every global access, and allow CSE to remove all redundant.
1353
1354                The final instruction sequences will look like the following
1355                for multiply global variables access.
1356
1357                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1358
1359                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1360                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1361                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1362                  ...  */
1363
1364             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1365             crtl->uses_pic_offset_table = 1;
1366             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1367
1368             if (mode != GET_MODE (gp_rtx))
1369              gp_rtx = gen_lowpart (mode, gp_rtx);
1370
1371           }
1372
1373         if (mode == ptr_mode)
1374           {
1375             if (mode == DImode)
1376               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1377             else
1378               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1379
1380             mem = XVECEXP (SET_SRC (insn), 0, 0);
1381           }
1382         else
1383           {
1384             gcc_assert (mode == Pmode);
1385
1386             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1387             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1388           }
1389
1390         /* The operand is expected to be MEM.  Whenever the related insn
1391            pattern changed, above code which calculate mem should be
1392            updated.  */
1393         gcc_assert (GET_CODE (mem) == MEM);
1394         MEM_READONLY_P (mem) = 1;
1395         MEM_NOTRAP_P (mem) = 1;
1396         emit_insn (insn);
1397         return;
1398       }
1399
1400     case SYMBOL_SMALL_GOT_4G:
1401       {
1402         /* In ILP32, the mode of dest can be either SImode or DImode,
1403            while the got entry is always of SImode size.  The mode of
1404            dest depends on how dest is used: if dest is assigned to a
1405            pointer (e.g. in the memory), it has SImode; it may have
1406            DImode if dest is dereferenced to access the memeory.
1407            This is why we have to handle three different ldr_got_small
1408            patterns here (two patterns for ILP32).  */
1409
1410         rtx insn;
1411         rtx mem;
1412         rtx tmp_reg = dest;
1413         machine_mode mode = GET_MODE (dest);
1414
1415         if (can_create_pseudo_p ())
1416           tmp_reg = gen_reg_rtx (mode);
1417
1418         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1419         if (mode == ptr_mode)
1420           {
1421             if (mode == DImode)
1422               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1423             else
1424               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1425
1426             mem = XVECEXP (SET_SRC (insn), 0, 0);
1427           }
1428         else
1429           {
1430             gcc_assert (mode == Pmode);
1431
1432             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1433             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1434           }
1435
1436         gcc_assert (GET_CODE (mem) == MEM);
1437         MEM_READONLY_P (mem) = 1;
1438         MEM_NOTRAP_P (mem) = 1;
1439         emit_insn (insn);
1440         return;
1441       }
1442
1443     case SYMBOL_SMALL_TLSGD:
1444       {
1445         rtx_insn *insns;
1446         machine_mode mode = GET_MODE (dest);
1447         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1448
1449         start_sequence ();
1450         if (TARGET_ILP32)
1451           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1452         else
1453           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1454         insns = get_insns ();
1455         end_sequence ();
1456
1457         RTL_CONST_CALL_P (insns) = 1;
1458         emit_libcall_block (insns, dest, result, imm);
1459         return;
1460       }
1461
1462     case SYMBOL_SMALL_TLSDESC:
1463       {
1464         machine_mode mode = GET_MODE (dest);
1465         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1466         rtx tp;
1467
1468         gcc_assert (mode == Pmode || mode == ptr_mode);
1469
1470         /* In ILP32, the got entry is always of SImode size.  Unlike
1471            small GOT, the dest is fixed at reg 0.  */
1472         if (TARGET_ILP32)
1473           emit_insn (gen_tlsdesc_small_si (imm));
1474         else
1475           emit_insn (gen_tlsdesc_small_di (imm));
1476         tp = aarch64_load_tp (NULL);
1477
1478         if (mode != Pmode)
1479           tp = gen_lowpart (mode, tp);
1480
1481         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1482         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1483         return;
1484       }
1485
1486     case SYMBOL_SMALL_TLSIE:
1487       {
1488         /* In ILP32, the mode of dest can be either SImode or DImode,
1489            while the got entry is always of SImode size.  The mode of
1490            dest depends on how dest is used: if dest is assigned to a
1491            pointer (e.g. in the memory), it has SImode; it may have
1492            DImode if dest is dereferenced to access the memeory.
1493            This is why we have to handle three different tlsie_small
1494            patterns here (two patterns for ILP32).  */
1495         machine_mode mode = GET_MODE (dest);
1496         rtx tmp_reg = gen_reg_rtx (mode);
1497         rtx tp = aarch64_load_tp (NULL);
1498
1499         if (mode == ptr_mode)
1500           {
1501             if (mode == DImode)
1502               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1503             else
1504               {
1505                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1506                 tp = gen_lowpart (mode, tp);
1507               }
1508           }
1509         else
1510           {
1511             gcc_assert (mode == Pmode);
1512             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1513           }
1514
1515         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1516         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1517         return;
1518       }
1519
1520     case SYMBOL_TLSLE12:
1521     case SYMBOL_TLSLE24:
1522     case SYMBOL_TLSLE32:
1523     case SYMBOL_TLSLE48:
1524       {
1525         machine_mode mode = GET_MODE (dest);
1526         rtx tp = aarch64_load_tp (NULL);
1527
1528         if (mode != Pmode)
1529           tp = gen_lowpart (mode, tp);
1530
1531         switch (type)
1532           {
1533           case SYMBOL_TLSLE12:
1534             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1535                         (dest, tp, imm));
1536             break;
1537           case SYMBOL_TLSLE24:
1538             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1539                         (dest, tp, imm));
1540           break;
1541           case SYMBOL_TLSLE32:
1542             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1543                         (dest, imm));
1544             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1545                         (dest, dest, tp));
1546           break;
1547           case SYMBOL_TLSLE48:
1548             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1549                         (dest, imm));
1550             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1551                         (dest, dest, tp));
1552             break;
1553           default:
1554             gcc_unreachable ();
1555           }
1556
1557         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1558         return;
1559       }
1560
1561     case SYMBOL_TINY_GOT:
1562       emit_insn (gen_ldr_got_tiny (dest, imm));
1563       return;
1564
1565     case SYMBOL_TINY_TLSIE:
1566       {
1567         machine_mode mode = GET_MODE (dest);
1568         rtx tp = aarch64_load_tp (NULL);
1569
1570         if (mode == ptr_mode)
1571           {
1572             if (mode == DImode)
1573               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1574             else
1575               {
1576                 tp = gen_lowpart (mode, tp);
1577                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1578               }
1579           }
1580         else
1581           {
1582             gcc_assert (mode == Pmode);
1583             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1584           }
1585
1586         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1587         return;
1588       }
1589
1590     default:
1591       gcc_unreachable ();
1592     }
1593 }
1594
1595 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1596    handle all moves if !can_create_pseudo_p ().  The distinction is
1597    important because, unlike emit_move_insn, the move expanders know
1598    how to force Pmode objects into the constant pool even when the
1599    constant pool address is not itself legitimate.  */
1600 static rtx
1601 aarch64_emit_move (rtx dest, rtx src)
1602 {
1603   return (can_create_pseudo_p ()
1604           ? emit_move_insn (dest, src)
1605           : emit_move_insn_1 (dest, src));
1606 }
1607
1608 /* Split a 128-bit move operation into two 64-bit move operations,
1609    taking care to handle partial overlap of register to register
1610    copies.  Special cases are needed when moving between GP regs and
1611    FP regs.  SRC can be a register, constant or memory; DST a register
1612    or memory.  If either operand is memory it must not have any side
1613    effects.  */
1614 void
1615 aarch64_split_128bit_move (rtx dst, rtx src)
1616 {
1617   rtx dst_lo, dst_hi;
1618   rtx src_lo, src_hi;
1619
1620   machine_mode mode = GET_MODE (dst);
1621
1622   gcc_assert (mode == TImode || mode == TFmode);
1623   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1624   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1625
1626   if (REG_P (dst) && REG_P (src))
1627     {
1628       int src_regno = REGNO (src);
1629       int dst_regno = REGNO (dst);
1630
1631       /* Handle FP <-> GP regs.  */
1632       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1633         {
1634           src_lo = gen_lowpart (word_mode, src);
1635           src_hi = gen_highpart (word_mode, src);
1636
1637           if (mode == TImode)
1638             {
1639               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1640               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1641             }
1642           else
1643             {
1644               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1645               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1646             }
1647           return;
1648         }
1649       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1650         {
1651           dst_lo = gen_lowpart (word_mode, dst);
1652           dst_hi = gen_highpart (word_mode, dst);
1653
1654           if (mode == TImode)
1655             {
1656               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1657               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1658             }
1659           else
1660             {
1661               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1662               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1663             }
1664           return;
1665         }
1666     }
1667
1668   dst_lo = gen_lowpart (word_mode, dst);
1669   dst_hi = gen_highpart (word_mode, dst);
1670   src_lo = gen_lowpart (word_mode, src);
1671   src_hi = gen_highpart_mode (word_mode, mode, src);
1672
1673   /* At most one pairing may overlap.  */
1674   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1675     {
1676       aarch64_emit_move (dst_hi, src_hi);
1677       aarch64_emit_move (dst_lo, src_lo);
1678     }
1679   else
1680     {
1681       aarch64_emit_move (dst_lo, src_lo);
1682       aarch64_emit_move (dst_hi, src_hi);
1683     }
1684 }
1685
1686 bool
1687 aarch64_split_128bit_move_p (rtx dst, rtx src)
1688 {
1689   return (! REG_P (src)
1690           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1691 }
1692
1693 /* Split a complex SIMD combine.  */
1694
1695 void
1696 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1697 {
1698   machine_mode src_mode = GET_MODE (src1);
1699   machine_mode dst_mode = GET_MODE (dst);
1700
1701   gcc_assert (VECTOR_MODE_P (dst_mode));
1702   gcc_assert (register_operand (dst, dst_mode)
1703               && register_operand (src1, src_mode)
1704               && register_operand (src2, src_mode));
1705
1706   rtx (*gen) (rtx, rtx, rtx);
1707
1708   switch (src_mode)
1709     {
1710     case E_V8QImode:
1711       gen = gen_aarch64_simd_combinev8qi;
1712       break;
1713     case E_V4HImode:
1714       gen = gen_aarch64_simd_combinev4hi;
1715       break;
1716     case E_V2SImode:
1717       gen = gen_aarch64_simd_combinev2si;
1718       break;
1719     case E_V4HFmode:
1720       gen = gen_aarch64_simd_combinev4hf;
1721       break;
1722     case E_V2SFmode:
1723       gen = gen_aarch64_simd_combinev2sf;
1724       break;
1725     case E_DImode:
1726       gen = gen_aarch64_simd_combinedi;
1727       break;
1728     case E_DFmode:
1729       gen = gen_aarch64_simd_combinedf;
1730       break;
1731     default:
1732       gcc_unreachable ();
1733     }
1734
1735   emit_insn (gen (dst, src1, src2));
1736   return;
1737 }
1738
1739 /* Split a complex SIMD move.  */
1740
1741 void
1742 aarch64_split_simd_move (rtx dst, rtx src)
1743 {
1744   machine_mode src_mode = GET_MODE (src);
1745   machine_mode dst_mode = GET_MODE (dst);
1746
1747   gcc_assert (VECTOR_MODE_P (dst_mode));
1748
1749   if (REG_P (dst) && REG_P (src))
1750     {
1751       rtx (*gen) (rtx, rtx);
1752
1753       gcc_assert (VECTOR_MODE_P (src_mode));
1754
1755       switch (src_mode)
1756         {
1757         case E_V16QImode:
1758           gen = gen_aarch64_split_simd_movv16qi;
1759           break;
1760         case E_V8HImode:
1761           gen = gen_aarch64_split_simd_movv8hi;
1762           break;
1763         case E_V4SImode:
1764           gen = gen_aarch64_split_simd_movv4si;
1765           break;
1766         case E_V2DImode:
1767           gen = gen_aarch64_split_simd_movv2di;
1768           break;
1769         case E_V8HFmode:
1770           gen = gen_aarch64_split_simd_movv8hf;
1771           break;
1772         case E_V4SFmode:
1773           gen = gen_aarch64_split_simd_movv4sf;
1774           break;
1775         case E_V2DFmode:
1776           gen = gen_aarch64_split_simd_movv2df;
1777           break;
1778         default:
1779           gcc_unreachable ();
1780         }
1781
1782       emit_insn (gen (dst, src));
1783       return;
1784     }
1785 }
1786
1787 bool
1788 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1789                               machine_mode ymode, rtx y)
1790 {
1791   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1792   gcc_assert (r != NULL);
1793   return rtx_equal_p (x, r);
1794 }
1795
1796
1797 static rtx
1798 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1799 {
1800   if (can_create_pseudo_p ())
1801     return force_reg (mode, value);
1802   else
1803     {
1804       x = aarch64_emit_move (x, value);
1805       return x;
1806     }
1807 }
1808
1809
1810 static rtx
1811 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1812                     HOST_WIDE_INT offset)
1813 {
1814   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1815     {
1816       rtx high;
1817       /* Load the full offset into a register.  This
1818          might be improvable in the future.  */
1819       high = GEN_INT (offset);
1820       offset = 0;
1821       high = aarch64_force_temporary (mode, temp, high);
1822       reg = aarch64_force_temporary (mode, temp,
1823                                      gen_rtx_PLUS (mode, high, reg));
1824     }
1825   return plus_constant (mode, reg, offset);
1826 }
1827
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830                                 scalar_int_mode mode)
1831 {
1832   int i;
1833   unsigned HOST_WIDE_INT val, val2, mask;
1834   int one_match, zero_match;
1835   int num_insns;
1836
1837   val = INTVAL (imm);
1838
1839   if (aarch64_move_imm (val, mode))
1840     {
1841       if (generate)
1842         emit_insn (gen_rtx_SET (dest, imm));
1843       return 1;
1844     }
1845
1846   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847      (with XXXX non-zero). In that case check to see if the move can be done in
1848      a smaller mode.  */
1849   val2 = val & 0xffffffff;
1850   if (mode == DImode
1851       && aarch64_move_imm (val2, SImode)
1852       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1853     {
1854       if (generate)
1855         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1856
1857       /* Check if we have to emit a second instruction by checking to see
1858          if any of the upper 32 bits of the original DI mode value is set.  */
1859       if (val == val2)
1860         return 1;
1861
1862       i = (val >> 48) ? 48 : 32;
1863
1864       if (generate)
1865          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866                                     GEN_INT ((val >> i) & 0xffff)));
1867
1868       return 2;
1869     }
1870
1871   if ((val >> 32) == 0 || mode == SImode)
1872     {
1873       if (generate)
1874         {
1875           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876           if (mode == SImode)
1877             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878                                        GEN_INT ((val >> 16) & 0xffff)));
1879           else
1880             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881                                        GEN_INT ((val >> 16) & 0xffff)));
1882         }
1883       return 2;
1884     }
1885
1886   /* Remaining cases are all for DImode.  */
1887
1888   mask = 0xffff;
1889   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1893
1894   if (zero_match != 2 && one_match != 2)
1895     {
1896       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897          For a 64-bit bitmask try whether changing 16 bits to all ones or
1898          zeroes creates a valid bitmask.  To check any repeated bitmask,
1899          try using 16 bits from the other 32-bit half of val.  */
1900
1901       for (i = 0; i < 64; i += 16, mask <<= 16)
1902         {
1903           val2 = val & ~mask;
1904           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905             break;
1906           val2 = val | mask;
1907           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908             break;
1909           val2 = val2 & ~mask;
1910           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912             break;
1913         }
1914       if (i != 64)
1915         {
1916           if (generate)
1917             {
1918               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920                                          GEN_INT ((val >> i) & 0xffff)));
1921             }
1922           return 2;
1923         }
1924     }
1925
1926   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1928      otherwise skip zero bits.  */
1929
1930   num_insns = 1;
1931   mask = 0xffff;
1932   val2 = one_match > zero_match ? ~val : val;
1933   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1934
1935   if (generate)
1936     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937                                            ? (val | ~(mask << i))
1938                                            : (val & (mask << i)))));
1939   for (i += 16; i < 64; i += 16)
1940     {
1941       if ((val2 & (mask << i)) == 0)
1942         continue;
1943       if (generate)
1944         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945                                    GEN_INT ((val >> i) & 0xffff)));
1946       num_insns ++;
1947     }
1948
1949   return num_insns;
1950 }
1951
1952
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1955 {
1956   machine_mode mode = GET_MODE (dest);
1957
1958   gcc_assert (mode == SImode || mode == DImode);
1959
1960   /* Check on what type of symbol it is.  */
1961   scalar_int_mode int_mode;
1962   if ((GET_CODE (imm) == SYMBOL_REF
1963        || GET_CODE (imm) == LABEL_REF
1964        || GET_CODE (imm) == CONST)
1965       && is_a <scalar_int_mode> (mode, &int_mode))
1966     {
1967       rtx mem, base, offset;
1968       enum aarch64_symbol_type sty;
1969
1970       /* If we have (const (plus symbol offset)), separate out the offset
1971          before we start classifying the symbol.  */
1972       split_const (imm, &base, &offset);
1973
1974       sty = aarch64_classify_symbol (base, offset);
1975       switch (sty)
1976         {
1977         case SYMBOL_FORCE_TO_MEM:
1978           if (offset != const0_rtx
1979               && targetm.cannot_force_const_mem (int_mode, imm))
1980             {
1981               gcc_assert (can_create_pseudo_p ());
1982               base = aarch64_force_temporary (int_mode, dest, base);
1983               base = aarch64_add_offset (int_mode, NULL, base,
1984                                          INTVAL (offset));
1985               aarch64_emit_move (dest, base);
1986               return;
1987             }
1988
1989           mem = force_const_mem (ptr_mode, imm);
1990           gcc_assert (mem);
1991
1992           /* If we aren't generating PC relative literals, then
1993              we need to expand the literal pool access carefully.
1994              This is something that needs to be done in a number
1995              of places, so could well live as a separate function.  */
1996           if (!aarch64_pcrelative_literal_loads)
1997             {
1998               gcc_assert (can_create_pseudo_p ());
1999               base = gen_reg_rtx (ptr_mode);
2000               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2001               if (ptr_mode != Pmode)
2002                 base = convert_memory_address (Pmode, base);
2003               mem = gen_rtx_MEM (ptr_mode, base);
2004             }
2005
2006           if (int_mode != ptr_mode)
2007             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2008
2009           emit_insn (gen_rtx_SET (dest, mem));
2010
2011           return;
2012
2013         case SYMBOL_SMALL_TLSGD:
2014         case SYMBOL_SMALL_TLSDESC:
2015         case SYMBOL_SMALL_TLSIE:
2016         case SYMBOL_SMALL_GOT_28K:
2017         case SYMBOL_SMALL_GOT_4G:
2018         case SYMBOL_TINY_GOT:
2019         case SYMBOL_TINY_TLSIE:
2020           if (offset != const0_rtx)
2021             {
2022               gcc_assert(can_create_pseudo_p ());
2023               base = aarch64_force_temporary (int_mode, dest, base);
2024               base = aarch64_add_offset (int_mode, NULL, base,
2025                                          INTVAL (offset));
2026               aarch64_emit_move (dest, base);
2027               return;
2028             }
2029           /* FALLTHRU */
2030
2031         case SYMBOL_SMALL_ABSOLUTE:
2032         case SYMBOL_TINY_ABSOLUTE:
2033         case SYMBOL_TLSLE12:
2034         case SYMBOL_TLSLE24:
2035         case SYMBOL_TLSLE32:
2036         case SYMBOL_TLSLE48:
2037           aarch64_load_symref_appropriately (dest, imm, sty);
2038           return;
2039
2040         default:
2041           gcc_unreachable ();
2042         }
2043     }
2044
2045   if (!CONST_INT_P (imm))
2046     {
2047       if (GET_CODE (imm) == HIGH)
2048         emit_insn (gen_rtx_SET (dest, imm));
2049       else
2050         {
2051           rtx mem = force_const_mem (mode, imm);
2052           gcc_assert (mem);
2053           emit_insn (gen_rtx_SET (dest, mem));
2054         }
2055
2056       return;
2057     }
2058
2059   aarch64_internal_mov_immediate (dest, imm, true,
2060                                   as_a <scalar_int_mode> (mode));
2061 }
2062
2063 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2064    temporary value if necessary.  FRAME_RELATED_P should be true if
2065    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2066    to the generated instructions.  If SCRATCHREG is known to hold
2067    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2068    immediate again.
2069
2070    Since this function may be used to adjust the stack pointer, we must
2071    ensure that it cannot cause transient stack deallocation (for example
2072    by first incrementing SP and then decrementing when adjusting by a
2073    large immediate).  */
2074
2075 static void
2076 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2077                                int scratchreg, HOST_WIDE_INT delta,
2078                                bool frame_related_p, bool emit_move_imm)
2079 {
2080   HOST_WIDE_INT mdelta = abs_hwi (delta);
2081   rtx this_rtx = gen_rtx_REG (mode, regnum);
2082   rtx_insn *insn;
2083
2084   if (!mdelta)
2085     return;
2086
2087   /* Single instruction adjustment.  */
2088   if (aarch64_uimm12_shift (mdelta))
2089     {
2090       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2091       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092       return;
2093     }
2094
2095   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2096      Only do this if mdelta is not a 16-bit move as adjusting using a move
2097      is better.  */
2098   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2099     {
2100       HOST_WIDE_INT low_off = mdelta & 0xfff;
2101
2102       low_off = delta < 0 ? -low_off : low_off;
2103       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2104       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2105       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2106       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2107       return;
2108     }
2109
2110   /* Emit a move immediate if required and an addition/subtraction.  */
2111   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2112   if (emit_move_imm)
2113     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2114   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2115                               : gen_add2_insn (this_rtx, scratch_rtx));
2116   if (frame_related_p)
2117     {
2118       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2119       rtx adj = plus_constant (mode, this_rtx, delta);
2120       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2121     }
2122 }
2123
2124 static inline void
2125 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2126                       HOST_WIDE_INT delta)
2127 {
2128   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2129 }
2130
2131 static inline void
2132 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2133 {
2134   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2135                                  true, emit_move_imm);
2136 }
2137
2138 static inline void
2139 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2140 {
2141   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2142                                  frame_related_p, true);
2143 }
2144
2145 static bool
2146 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2147                                  tree exp ATTRIBUTE_UNUSED)
2148 {
2149   /* Currently, always true.  */
2150   return true;
2151 }
2152
2153 /* Implement TARGET_PASS_BY_REFERENCE.  */
2154
2155 static bool
2156 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2157                            machine_mode mode,
2158                            const_tree type,
2159                            bool named ATTRIBUTE_UNUSED)
2160 {
2161   HOST_WIDE_INT size;
2162   machine_mode dummymode;
2163   int nregs;
2164
2165   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2166   size = (mode == BLKmode && type)
2167     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2168
2169   /* Aggregates are passed by reference based on their size.  */
2170   if (type && AGGREGATE_TYPE_P (type))
2171     {
2172       size = int_size_in_bytes (type);
2173     }
2174
2175   /* Variable sized arguments are always returned by reference.  */
2176   if (size < 0)
2177     return true;
2178
2179   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2180   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2181                                                &dummymode, &nregs,
2182                                                NULL))
2183     return false;
2184
2185   /* Arguments which are variable sized or larger than 2 registers are
2186      passed by reference unless they are a homogenous floating point
2187      aggregate.  */
2188   return size > 2 * UNITS_PER_WORD;
2189 }
2190
2191 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2192 static bool
2193 aarch64_return_in_msb (const_tree valtype)
2194 {
2195   machine_mode dummy_mode;
2196   int dummy_int;
2197
2198   /* Never happens in little-endian mode.  */
2199   if (!BYTES_BIG_ENDIAN)
2200     return false;
2201
2202   /* Only composite types smaller than or equal to 16 bytes can
2203      be potentially returned in registers.  */
2204   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2205       || int_size_in_bytes (valtype) <= 0
2206       || int_size_in_bytes (valtype) > 16)
2207     return false;
2208
2209   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2210      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2211      is always passed/returned in the least significant bits of fp/simd
2212      register(s).  */
2213   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2214                                                &dummy_mode, &dummy_int, NULL))
2215     return false;
2216
2217   return true;
2218 }
2219
2220 /* Implement TARGET_FUNCTION_VALUE.
2221    Define how to find the value returned by a function.  */
2222
2223 static rtx
2224 aarch64_function_value (const_tree type, const_tree func,
2225                         bool outgoing ATTRIBUTE_UNUSED)
2226 {
2227   machine_mode mode;
2228   int unsignedp;
2229   int count;
2230   machine_mode ag_mode;
2231
2232   mode = TYPE_MODE (type);
2233   if (INTEGRAL_TYPE_P (type))
2234     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2235
2236   if (aarch64_return_in_msb (type))
2237     {
2238       HOST_WIDE_INT size = int_size_in_bytes (type);
2239
2240       if (size % UNITS_PER_WORD != 0)
2241         {
2242           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2243           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2244         }
2245     }
2246
2247   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2248                                                &ag_mode, &count, NULL))
2249     {
2250       if (!aarch64_composite_type_p (type, mode))
2251         {
2252           gcc_assert (count == 1 && mode == ag_mode);
2253           return gen_rtx_REG (mode, V0_REGNUM);
2254         }
2255       else
2256         {
2257           int i;
2258           rtx par;
2259
2260           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2261           for (i = 0; i < count; i++)
2262             {
2263               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2264               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2265                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2266               XVECEXP (par, 0, i) = tmp;
2267             }
2268           return par;
2269         }
2270     }
2271   else
2272     return gen_rtx_REG (mode, R0_REGNUM);
2273 }
2274
2275 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2276    Return true if REGNO is the number of a hard register in which the values
2277    of called function may come back.  */
2278
2279 static bool
2280 aarch64_function_value_regno_p (const unsigned int regno)
2281 {
2282   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2283      of 16-byte return values are: 128-bit integers and 16-byte small
2284      structures (excluding homogeneous floating-point aggregates).  */
2285   if (regno == R0_REGNUM || regno == R1_REGNUM)
2286     return true;
2287
2288   /* Up to four fp/simd registers can return a function value, e.g. a
2289      homogeneous floating-point aggregate having four members.  */
2290   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2291     return TARGET_FLOAT;
2292
2293   return false;
2294 }
2295
2296 /* Implement TARGET_RETURN_IN_MEMORY.
2297
2298    If the type T of the result of a function is such that
2299      void func (T arg)
2300    would require that arg be passed as a value in a register (or set of
2301    registers) according to the parameter passing rules, then the result
2302    is returned in the same registers as would be used for such an
2303    argument.  */
2304
2305 static bool
2306 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2307 {
2308   HOST_WIDE_INT size;
2309   machine_mode ag_mode;
2310   int count;
2311
2312   if (!AGGREGATE_TYPE_P (type)
2313       && TREE_CODE (type) != COMPLEX_TYPE
2314       && TREE_CODE (type) != VECTOR_TYPE)
2315     /* Simple scalar types always returned in registers.  */
2316     return false;
2317
2318   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2319                                                type,
2320                                                &ag_mode,
2321                                                &count,
2322                                                NULL))
2323     return false;
2324
2325   /* Types larger than 2 registers returned in memory.  */
2326   size = int_size_in_bytes (type);
2327   return (size < 0 || size > 2 * UNITS_PER_WORD);
2328 }
2329
2330 static bool
2331 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2332                                const_tree type, int *nregs)
2333 {
2334   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2335   return aarch64_vfp_is_call_or_return_candidate (mode,
2336                                                   type,
2337                                                   &pcum->aapcs_vfp_rmode,
2338                                                   nregs,
2339                                                   NULL);
2340 }
2341
2342 /* Given MODE and TYPE of a function argument, return the alignment in
2343    bits.  The idea is to suppress any stronger alignment requested by
2344    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2345    This is a helper function for local use only.  */
2346
2347 static unsigned int
2348 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2349 {
2350   if (!type)
2351     return GET_MODE_ALIGNMENT (mode);
2352
2353   if (integer_zerop (TYPE_SIZE (type)))
2354     return 0;
2355
2356   gcc_assert (TYPE_MODE (type) == mode);
2357
2358   if (!AGGREGATE_TYPE_P (type))
2359     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2360
2361   if (TREE_CODE (type) == ARRAY_TYPE)
2362     return TYPE_ALIGN (TREE_TYPE (type));
2363
2364   unsigned int alignment = 0;
2365   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2366     if (TREE_CODE (field) == FIELD_DECL)
2367       alignment = std::max (alignment, DECL_ALIGN (field));
2368
2369   return alignment;
2370 }
2371
2372 /* Layout a function argument according to the AAPCS64 rules.  The rule
2373    numbers refer to the rule numbers in the AAPCS64.  */
2374
2375 static void
2376 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2377                     const_tree type,
2378                     bool named ATTRIBUTE_UNUSED)
2379 {
2380   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2381   int ncrn, nvrn, nregs;
2382   bool allocate_ncrn, allocate_nvrn;
2383   HOST_WIDE_INT size;
2384
2385   /* We need to do this once per argument.  */
2386   if (pcum->aapcs_arg_processed)
2387     return;
2388
2389   pcum->aapcs_arg_processed = true;
2390
2391   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2392   size
2393     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2394                 UNITS_PER_WORD);
2395
2396   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2397   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2398                                                  mode,
2399                                                  type,
2400                                                  &nregs);
2401
2402   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2403      The following code thus handles passing by SIMD/FP registers first.  */
2404
2405   nvrn = pcum->aapcs_nvrn;
2406
2407   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2408      and homogenous short-vector aggregates (HVA).  */
2409   if (allocate_nvrn)
2410     {
2411       if (!TARGET_FLOAT)
2412         aarch64_err_no_fpadvsimd (mode, "argument");
2413
2414       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2415         {
2416           pcum->aapcs_nextnvrn = nvrn + nregs;
2417           if (!aarch64_composite_type_p (type, mode))
2418             {
2419               gcc_assert (nregs == 1);
2420               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2421             }
2422           else
2423             {
2424               rtx par;
2425               int i;
2426               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2427               for (i = 0; i < nregs; i++)
2428                 {
2429                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2430                                          V0_REGNUM + nvrn + i);
2431                   tmp = gen_rtx_EXPR_LIST
2432                     (VOIDmode, tmp,
2433                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2434                   XVECEXP (par, 0, i) = tmp;
2435                 }
2436               pcum->aapcs_reg = par;
2437             }
2438           return;
2439         }
2440       else
2441         {
2442           /* C.3 NSRN is set to 8.  */
2443           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2444           goto on_stack;
2445         }
2446     }
2447
2448   ncrn = pcum->aapcs_ncrn;
2449   nregs = size / UNITS_PER_WORD;
2450
2451   /* C6 - C9.  though the sign and zero extension semantics are
2452      handled elsewhere.  This is the case where the argument fits
2453      entirely general registers.  */
2454   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2455     {
2456
2457       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2458
2459       /* C.8 if the argument has an alignment of 16 then the NGRN is
2460          rounded up to the next even number.  */
2461       if (nregs == 2
2462           && ncrn % 2
2463           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2464              comparison is there because for > 16 * BITS_PER_UNIT
2465              alignment nregs should be > 2 and therefore it should be
2466              passed by reference rather than value.  */
2467           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2468         {
2469           ++ncrn;
2470           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2471         }
2472
2473       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2474          A reg is still generated for it, but the caller should be smart
2475          enough not to use it.  */
2476       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2477         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2478       else
2479         {
2480           rtx par;
2481           int i;
2482
2483           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2484           for (i = 0; i < nregs; i++)
2485             {
2486               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2487               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2488                                        GEN_INT (i * UNITS_PER_WORD));
2489               XVECEXP (par, 0, i) = tmp;
2490             }
2491           pcum->aapcs_reg = par;
2492         }
2493
2494       pcum->aapcs_nextncrn = ncrn + nregs;
2495       return;
2496     }
2497
2498   /* C.11  */
2499   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2500
2501   /* The argument is passed on stack; record the needed number of words for
2502      this argument and align the total size if necessary.  */
2503 on_stack:
2504   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2505
2506   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2507     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2508                                        16 / UNITS_PER_WORD);
2509   return;
2510 }
2511
2512 /* Implement TARGET_FUNCTION_ARG.  */
2513
2514 static rtx
2515 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2516                       const_tree type, bool named)
2517 {
2518   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2519   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2520
2521   if (mode == VOIDmode)
2522     return NULL_RTX;
2523
2524   aarch64_layout_arg (pcum_v, mode, type, named);
2525   return pcum->aapcs_reg;
2526 }
2527
2528 void
2529 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2530                            const_tree fntype ATTRIBUTE_UNUSED,
2531                            rtx libname ATTRIBUTE_UNUSED,
2532                            const_tree fndecl ATTRIBUTE_UNUSED,
2533                            unsigned n_named ATTRIBUTE_UNUSED)
2534 {
2535   pcum->aapcs_ncrn = 0;
2536   pcum->aapcs_nvrn = 0;
2537   pcum->aapcs_nextncrn = 0;
2538   pcum->aapcs_nextnvrn = 0;
2539   pcum->pcs_variant = ARM_PCS_AAPCS64;
2540   pcum->aapcs_reg = NULL_RTX;
2541   pcum->aapcs_arg_processed = false;
2542   pcum->aapcs_stack_words = 0;
2543   pcum->aapcs_stack_size = 0;
2544
2545   if (!TARGET_FLOAT
2546       && fndecl && TREE_PUBLIC (fndecl)
2547       && fntype && fntype != error_mark_node)
2548     {
2549       const_tree type = TREE_TYPE (fntype);
2550       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2551       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2552       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2553                                                    &mode, &nregs, NULL))
2554         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2555     }
2556   return;
2557 }
2558
2559 static void
2560 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2561                               machine_mode mode,
2562                               const_tree type,
2563                               bool named)
2564 {
2565   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2566   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2567     {
2568       aarch64_layout_arg (pcum_v, mode, type, named);
2569       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2570                   != (pcum->aapcs_stack_words != 0));
2571       pcum->aapcs_arg_processed = false;
2572       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2573       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2574       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2575       pcum->aapcs_stack_words = 0;
2576       pcum->aapcs_reg = NULL_RTX;
2577     }
2578 }
2579
2580 bool
2581 aarch64_function_arg_regno_p (unsigned regno)
2582 {
2583   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2584           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2585 }
2586
2587 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2588    PARM_BOUNDARY bits of alignment, but will be given anything up
2589    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2590    that both before and after the layout of each argument, the Next
2591    Stacked Argument Address (NSAA) will have a minimum alignment of
2592    8 bytes.  */
2593
2594 static unsigned int
2595 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2596 {
2597   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2598   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2599 }
2600
2601 /* Implement TARGET_FUNCTION_ARG_PADDING.
2602
2603    Small aggregate types are placed in the lowest memory address.
2604
2605    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2606
2607 static pad_direction
2608 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2609 {
2610   /* On little-endian targets, the least significant byte of every stack
2611      argument is passed at the lowest byte address of the stack slot.  */
2612   if (!BYTES_BIG_ENDIAN)
2613     return PAD_UPWARD;
2614
2615   /* Otherwise, integral, floating-point and pointer types are padded downward:
2616      the least significant byte of a stack argument is passed at the highest
2617      byte address of the stack slot.  */
2618   if (type
2619       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2620          || POINTER_TYPE_P (type))
2621       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2622     return PAD_DOWNWARD;
2623
2624   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2625   return PAD_UPWARD;
2626 }
2627
2628 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2629
2630    It specifies padding for the last (may also be the only)
2631    element of a block move between registers and memory.  If
2632    assuming the block is in the memory, padding upward means that
2633    the last element is padded after its highest significant byte,
2634    while in downward padding, the last element is padded at the
2635    its least significant byte side.
2636
2637    Small aggregates and small complex types are always padded
2638    upwards.
2639
2640    We don't need to worry about homogeneous floating-point or
2641    short-vector aggregates; their move is not affected by the
2642    padding direction determined here.  Regardless of endianness,
2643    each element of such an aggregate is put in the least
2644    significant bits of a fp/simd register.
2645
2646    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2647    register has useful data, and return the opposite if the most
2648    significant byte does.  */
2649
2650 bool
2651 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2652                      bool first ATTRIBUTE_UNUSED)
2653 {
2654
2655   /* Small composite types are always padded upward.  */
2656   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2657     {
2658       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2659                             : GET_MODE_SIZE (mode));
2660       if (size < 2 * UNITS_PER_WORD)
2661         return true;
2662     }
2663
2664   /* Otherwise, use the default padding.  */
2665   return !BYTES_BIG_ENDIAN;
2666 }
2667
2668 static scalar_int_mode
2669 aarch64_libgcc_cmp_return_mode (void)
2670 {
2671   return SImode;
2672 }
2673
2674 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2675
2676 /* We use the 12-bit shifted immediate arithmetic instructions so values
2677    must be multiple of (1 << 12), i.e. 4096.  */
2678 #define ARITH_FACTOR 4096
2679
2680 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2681 #error Cannot use simple address calculation for stack probing
2682 #endif
2683
2684 /* The pair of scratch registers used for stack probing.  */
2685 #define PROBE_STACK_FIRST_REG  9
2686 #define PROBE_STACK_SECOND_REG 10
2687
2688 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2689    inclusive.  These are offsets from the current stack pointer.  */
2690
2691 static void
2692 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2693 {
2694   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2695
2696   /* See the same assertion on PROBE_INTERVAL above.  */
2697   gcc_assert ((first % ARITH_FACTOR) == 0);
2698
2699   /* See if we have a constant small number of probes to generate.  If so,
2700      that's the easy case.  */
2701   if (size <= PROBE_INTERVAL)
2702     {
2703       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2704
2705       emit_set_insn (reg1,
2706                      plus_constant (Pmode,
2707                                     stack_pointer_rtx, -(first + base)));
2708       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2709     }
2710
2711   /* The run-time loop is made up of 8 insns in the generic case while the
2712      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2713   else if (size <= 4 * PROBE_INTERVAL)
2714     {
2715       HOST_WIDE_INT i, rem;
2716
2717       emit_set_insn (reg1,
2718                      plus_constant (Pmode,
2719                                     stack_pointer_rtx,
2720                                     -(first + PROBE_INTERVAL)));
2721       emit_stack_probe (reg1);
2722
2723       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2724          it exceeds SIZE.  If only two probes are needed, this will not
2725          generate any code.  Then probe at FIRST + SIZE.  */
2726       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2727         {
2728           emit_set_insn (reg1,
2729                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2730           emit_stack_probe (reg1);
2731         }
2732
2733       rem = size - (i - PROBE_INTERVAL);
2734       if (rem > 256)
2735         {
2736           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2737
2738           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2739           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2740         }
2741       else
2742         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2743     }
2744
2745   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2746      extra careful with variables wrapping around because we might be at
2747      the very top (or the very bottom) of the address space and we have
2748      to be able to handle this case properly; in particular, we use an
2749      equality test for the loop condition.  */
2750   else
2751     {
2752       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2753
2754       /* Step 1: round SIZE to the previous multiple of the interval.  */
2755
2756       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2757
2758
2759       /* Step 2: compute initial and final value of the loop counter.  */
2760
2761       /* TEST_ADDR = SP + FIRST.  */
2762       emit_set_insn (reg1,
2763                      plus_constant (Pmode, stack_pointer_rtx, -first));
2764
2765       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2766       HOST_WIDE_INT adjustment = - (first + rounded_size);
2767       if (! aarch64_uimm12_shift (adjustment))
2768         {
2769           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2770                                           true, Pmode);
2771           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2772         }
2773       else
2774         {
2775           emit_set_insn (reg2,
2776                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2777         }
2778
2779       /* Step 3: the loop
2780
2781          do
2782            {
2783              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2784              probe at TEST_ADDR
2785            }
2786          while (TEST_ADDR != LAST_ADDR)
2787
2788          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2789          until it is equal to ROUNDED_SIZE.  */
2790
2791       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2792
2793
2794       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2795          that SIZE is equal to ROUNDED_SIZE.  */
2796
2797       if (size != rounded_size)
2798         {
2799           HOST_WIDE_INT rem = size - rounded_size;
2800
2801           if (rem > 256)
2802             {
2803               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2804
2805               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2806               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2807             }
2808           else
2809             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2810         }
2811     }
2812
2813   /* Make sure nothing is scheduled before we are done.  */
2814   emit_insn (gen_blockage ());
2815 }
2816
2817 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2818    absolute addresses.  */
2819
2820 const char *
2821 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2822 {
2823   static int labelno = 0;
2824   char loop_lab[32];
2825   rtx xops[2];
2826
2827   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2828
2829   /* Loop.  */
2830   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2831
2832   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2833   xops[0] = reg1;
2834   xops[1] = GEN_INT (PROBE_INTERVAL);
2835   output_asm_insn ("sub\t%0, %0, %1", xops);
2836
2837   /* Probe at TEST_ADDR.  */
2838   output_asm_insn ("str\txzr, [%0]", xops);
2839
2840   /* Test if TEST_ADDR == LAST_ADDR.  */
2841   xops[1] = reg2;
2842   output_asm_insn ("cmp\t%0, %1", xops);
2843
2844   /* Branch.  */
2845   fputs ("\tb.ne\t", asm_out_file);
2846   assemble_name_raw (asm_out_file, loop_lab);
2847   fputc ('\n', asm_out_file);
2848
2849   return "";
2850 }
2851
2852 static bool
2853 aarch64_frame_pointer_required (void)
2854 {
2855   /* In aarch64_override_options_after_change
2856      flag_omit_leaf_frame_pointer turns off the frame pointer by
2857      default.  Turn it back on now if we've not got a leaf
2858      function.  */
2859   if (flag_omit_leaf_frame_pointer
2860       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2861     return true;
2862
2863   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2864   if (crtl->calls_eh_return)
2865     return true;
2866
2867   return false;
2868 }
2869
2870 /* Mark the registers that need to be saved by the callee and calculate
2871    the size of the callee-saved registers area and frame record (both FP
2872    and LR may be omitted).  */
2873 static void
2874 aarch64_layout_frame (void)
2875 {
2876   HOST_WIDE_INT offset = 0;
2877   int regno, last_fp_reg = INVALID_REGNUM;
2878
2879   if (reload_completed && cfun->machine->frame.laid_out)
2880     return;
2881
2882 #define SLOT_NOT_REQUIRED (-2)
2883 #define SLOT_REQUIRED     (-1)
2884
2885   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2886   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2887
2888   /* First mark all the registers that really need to be saved...  */
2889   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2890     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2891
2892   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2893     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2894
2895   /* ... that includes the eh data registers (if needed)...  */
2896   if (crtl->calls_eh_return)
2897     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2898       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2899         = SLOT_REQUIRED;
2900
2901   /* ... and any callee saved register that dataflow says is live.  */
2902   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2903     if (df_regs_ever_live_p (regno)
2904         && (regno == R30_REGNUM
2905             || !call_used_regs[regno]))
2906       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2907
2908   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2909     if (df_regs_ever_live_p (regno)
2910         && !call_used_regs[regno])
2911       {
2912         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2913         last_fp_reg = regno;
2914       }
2915
2916   if (frame_pointer_needed)
2917     {
2918       /* FP and LR are placed in the linkage record.  */
2919       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2920       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2921       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2922       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2923       offset += 2 * UNITS_PER_WORD;
2924     }
2925
2926   /* Now assign stack slots for them.  */
2927   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2928     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2929       {
2930         cfun->machine->frame.reg_offset[regno] = offset;
2931         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2932           cfun->machine->frame.wb_candidate1 = regno;
2933         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2934           cfun->machine->frame.wb_candidate2 = regno;
2935         offset += UNITS_PER_WORD;
2936       }
2937
2938   HOST_WIDE_INT max_int_offset = offset;
2939   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2940   bool has_align_gap = offset != max_int_offset;
2941
2942   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2943     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2944       {
2945         /* If there is an alignment gap between integer and fp callee-saves,
2946            allocate the last fp register to it if possible.  */
2947         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2948           {
2949             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2950             break;
2951           }
2952
2953         cfun->machine->frame.reg_offset[regno] = offset;
2954         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2955           cfun->machine->frame.wb_candidate1 = regno;
2956         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2957                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2958           cfun->machine->frame.wb_candidate2 = regno;
2959         offset += UNITS_PER_WORD;
2960       }
2961
2962   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2963
2964   cfun->machine->frame.saved_regs_size = offset;
2965
2966   HOST_WIDE_INT varargs_and_saved_regs_size
2967     = offset + cfun->machine->frame.saved_varargs_size;
2968
2969   cfun->machine->frame.hard_fp_offset
2970     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2971                 STACK_BOUNDARY / BITS_PER_UNIT);
2972
2973   cfun->machine->frame.frame_size
2974     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2975                 + crtl->outgoing_args_size,
2976                 STACK_BOUNDARY / BITS_PER_UNIT);
2977
2978   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2979
2980   cfun->machine->frame.initial_adjust = 0;
2981   cfun->machine->frame.final_adjust = 0;
2982   cfun->machine->frame.callee_adjust = 0;
2983   cfun->machine->frame.callee_offset = 0;
2984
2985   HOST_WIDE_INT max_push_offset = 0;
2986   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2987     max_push_offset = 512;
2988   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2989     max_push_offset = 256;
2990
2991   if (cfun->machine->frame.frame_size < max_push_offset
2992       && crtl->outgoing_args_size == 0)
2993     {
2994       /* Simple, small frame with no outgoing arguments:
2995          stp reg1, reg2, [sp, -frame_size]!
2996          stp reg3, reg4, [sp, 16]  */
2997       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2998     }
2999   else if ((crtl->outgoing_args_size
3000             + cfun->machine->frame.saved_regs_size < 512)
3001            && !(cfun->calls_alloca
3002                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3003     {
3004       /* Frame with small outgoing arguments:
3005          sub sp, sp, frame_size
3006          stp reg1, reg2, [sp, outgoing_args_size]
3007          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
3008       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3009       cfun->machine->frame.callee_offset
3010         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3011     }
3012   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3013     {
3014       /* Frame with large outgoing arguments but a small local area:
3015          stp reg1, reg2, [sp, -hard_fp_offset]!
3016          stp reg3, reg4, [sp, 16]
3017          sub sp, sp, outgoing_args_size  */
3018       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3019       cfun->machine->frame.final_adjust
3020         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3021     }
3022   else if (!frame_pointer_needed
3023            && varargs_and_saved_regs_size < max_push_offset)
3024     {
3025       /* Frame with large local area and outgoing arguments (this pushes the
3026          callee-saves first, followed by the locals and outgoing area):
3027          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3028          stp reg3, reg4, [sp, 16]
3029          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
3030       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3031       cfun->machine->frame.final_adjust
3032         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3033       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3034       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3035     }
3036   else
3037     {
3038       /* Frame with large local area and outgoing arguments using frame pointer:
3039          sub sp, sp, hard_fp_offset
3040          stp x29, x30, [sp, 0]
3041          add x29, sp, 0
3042          stp reg3, reg4, [sp, 16]
3043          sub sp, sp, outgoing_args_size  */
3044       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3045       cfun->machine->frame.final_adjust
3046         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3047     }
3048
3049   cfun->machine->frame.laid_out = true;
3050 }
3051
3052 /* Return true if the register REGNO is saved on entry to
3053    the current function.  */
3054
3055 static bool
3056 aarch64_register_saved_on_entry (int regno)
3057 {
3058   return cfun->machine->frame.reg_offset[regno] >= 0;
3059 }
3060
3061 /* Return the next register up from REGNO up to LIMIT for the callee
3062    to save.  */
3063
3064 static unsigned
3065 aarch64_next_callee_save (unsigned regno, unsigned limit)
3066 {
3067   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3068     regno ++;
3069   return regno;
3070 }
3071
3072 /* Push the register number REGNO of mode MODE to the stack with write-back
3073    adjusting the stack by ADJUSTMENT.  */
3074
3075 static void
3076 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3077                            HOST_WIDE_INT adjustment)
3078  {
3079   rtx base_rtx = stack_pointer_rtx;
3080   rtx insn, reg, mem;
3081
3082   reg = gen_rtx_REG (mode, regno);
3083   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3084                             plus_constant (Pmode, base_rtx, -adjustment));
3085   mem = gen_frame_mem (mode, mem);
3086
3087   insn = emit_move_insn (mem, reg);
3088   RTX_FRAME_RELATED_P (insn) = 1;
3089 }
3090
3091 /* Generate and return an instruction to store the pair of registers
3092    REG and REG2 of mode MODE to location BASE with write-back adjusting
3093    the stack location BASE by ADJUSTMENT.  */
3094
3095 static rtx
3096 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3097                           HOST_WIDE_INT adjustment)
3098 {
3099   switch (mode)
3100     {
3101     case E_DImode:
3102       return gen_storewb_pairdi_di (base, base, reg, reg2,
3103                                     GEN_INT (-adjustment),
3104                                     GEN_INT (UNITS_PER_WORD - adjustment));
3105     case E_DFmode:
3106       return gen_storewb_pairdf_di (base, base, reg, reg2,
3107                                     GEN_INT (-adjustment),
3108                                     GEN_INT (UNITS_PER_WORD - adjustment));
3109     default:
3110       gcc_unreachable ();
3111     }
3112 }
3113
3114 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3115    stack pointer by ADJUSTMENT.  */
3116
3117 static void
3118 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3119 {
3120   rtx_insn *insn;
3121   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3122
3123   if (regno2 == INVALID_REGNUM)
3124     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3125
3126   rtx reg1 = gen_rtx_REG (mode, regno1);
3127   rtx reg2 = gen_rtx_REG (mode, regno2);
3128
3129   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3130                                               reg2, adjustment));
3131   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3132   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3133   RTX_FRAME_RELATED_P (insn) = 1;
3134 }
3135
3136 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3137    adjusting it by ADJUSTMENT afterwards.  */
3138
3139 static rtx
3140 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3141                          HOST_WIDE_INT adjustment)
3142 {
3143   switch (mode)
3144     {
3145     case E_DImode:
3146       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3147                                    GEN_INT (UNITS_PER_WORD));
3148     case E_DFmode:
3149       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3150                                    GEN_INT (UNITS_PER_WORD));
3151     default:
3152       gcc_unreachable ();
3153     }
3154 }
3155
3156 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3157    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3158    into CFI_OPS.  */
3159
3160 static void
3161 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3162                   rtx *cfi_ops)
3163 {
3164   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3165   rtx reg1 = gen_rtx_REG (mode, regno1);
3166
3167   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3168
3169   if (regno2 == INVALID_REGNUM)
3170     {
3171       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3172       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3173       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3174     }
3175   else
3176     {
3177       rtx reg2 = gen_rtx_REG (mode, regno2);
3178       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3179       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3180                                           reg2, adjustment));
3181     }
3182 }
3183
3184 /* Generate and return a store pair instruction of mode MODE to store
3185    register REG1 to MEM1 and register REG2 to MEM2.  */
3186
3187 static rtx
3188 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3189                         rtx reg2)
3190 {
3191   switch (mode)
3192     {
3193     case E_DImode:
3194       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3195
3196     case E_DFmode:
3197       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3198
3199     default:
3200       gcc_unreachable ();
3201     }
3202 }
3203
3204 /* Generate and regurn a load pair isntruction of mode MODE to load register
3205    REG1 from MEM1 and register REG2 from MEM2.  */
3206
3207 static rtx
3208 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3209                        rtx mem2)
3210 {
3211   switch (mode)
3212     {
3213     case E_DImode:
3214       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3215
3216     case E_DFmode:
3217       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3218
3219     default:
3220       gcc_unreachable ();
3221     }
3222 }
3223
3224 /* Return TRUE if return address signing should be enabled for the current
3225    function, otherwise return FALSE.  */
3226
3227 bool
3228 aarch64_return_address_signing_enabled (void)
3229 {
3230   /* This function should only be called after frame laid out.   */
3231   gcc_assert (cfun->machine->frame.laid_out);
3232
3233   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3234      if it's LR is pushed onto stack.  */
3235   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3236           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3237               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3238 }
3239
3240 /* Emit code to save the callee-saved registers from register number START
3241    to LIMIT to the stack at the location starting at offset START_OFFSET,
3242    skipping any write-back candidates if SKIP_WB is true.  */
3243
3244 static void
3245 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3246                            unsigned start, unsigned limit, bool skip_wb)
3247 {
3248   rtx_insn *insn;
3249   unsigned regno;
3250   unsigned regno2;
3251
3252   for (regno = aarch64_next_callee_save (start, limit);
3253        regno <= limit;
3254        regno = aarch64_next_callee_save (regno + 1, limit))
3255     {
3256       rtx reg, mem;
3257       HOST_WIDE_INT offset;
3258
3259       if (skip_wb
3260           && (regno == cfun->machine->frame.wb_candidate1
3261               || regno == cfun->machine->frame.wb_candidate2))
3262         continue;
3263
3264       if (cfun->machine->reg_is_wrapped_separately[regno])
3265        continue;
3266
3267       reg = gen_rtx_REG (mode, regno);
3268       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3269       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3270                                                 offset));
3271
3272       regno2 = aarch64_next_callee_save (regno + 1, limit);
3273
3274       if (regno2 <= limit
3275           && !cfun->machine->reg_is_wrapped_separately[regno2]
3276           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3277               == cfun->machine->frame.reg_offset[regno2]))
3278
3279         {
3280           rtx reg2 = gen_rtx_REG (mode, regno2);
3281           rtx mem2;
3282
3283           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3284           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3285                                                      offset));
3286           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3287                                                     reg2));
3288
3289           /* The first part of a frame-related parallel insn is
3290              always assumed to be relevant to the frame
3291              calculations; subsequent parts, are only
3292              frame-related if explicitly marked.  */
3293           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3294           regno = regno2;
3295         }
3296       else
3297         insn = emit_move_insn (mem, reg);
3298
3299       RTX_FRAME_RELATED_P (insn) = 1;
3300     }
3301 }
3302
3303 /* Emit code to restore the callee registers of mode MODE from register
3304    number START up to and including LIMIT.  Restore from the stack offset
3305    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3306    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3307
3308 static void
3309 aarch64_restore_callee_saves (machine_mode mode,
3310                               HOST_WIDE_INT start_offset, unsigned start,
3311                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3312 {
3313   rtx base_rtx = stack_pointer_rtx;
3314   unsigned regno;
3315   unsigned regno2;
3316   HOST_WIDE_INT offset;
3317
3318   for (regno = aarch64_next_callee_save (start, limit);
3319        regno <= limit;
3320        regno = aarch64_next_callee_save (regno + 1, limit))
3321     {
3322       if (cfun->machine->reg_is_wrapped_separately[regno])
3323        continue;
3324
3325       rtx reg, mem;
3326
3327       if (skip_wb
3328           && (regno == cfun->machine->frame.wb_candidate1
3329               || regno == cfun->machine->frame.wb_candidate2))
3330         continue;
3331
3332       reg = gen_rtx_REG (mode, regno);
3333       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3334       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3335
3336       regno2 = aarch64_next_callee_save (regno + 1, limit);
3337
3338       if (regno2 <= limit
3339           && !cfun->machine->reg_is_wrapped_separately[regno2]
3340           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3341               == cfun->machine->frame.reg_offset[regno2]))
3342         {
3343           rtx reg2 = gen_rtx_REG (mode, regno2);
3344           rtx mem2;
3345
3346           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3347           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3348           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3349
3350           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3351           regno = regno2;
3352         }
3353       else
3354         emit_move_insn (reg, mem);
3355       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3356     }
3357 }
3358
3359 static inline bool
3360 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3361                                HOST_WIDE_INT offset)
3362 {
3363   return offset >= -256 && offset < 256;
3364 }
3365
3366 static inline bool
3367 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3368 {
3369   return (offset >= 0
3370           && offset < 4096 * GET_MODE_SIZE (mode)
3371           && offset % GET_MODE_SIZE (mode) == 0);
3372 }
3373
3374 bool
3375 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3376 {
3377   return (offset >= -64 * GET_MODE_SIZE (mode)
3378           && offset < 64 * GET_MODE_SIZE (mode)
3379           && offset % GET_MODE_SIZE (mode) == 0);
3380 }
3381
3382 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3383
3384 static sbitmap
3385 aarch64_get_separate_components (void)
3386 {
3387   aarch64_layout_frame ();
3388
3389   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3390   bitmap_clear (components);
3391
3392   /* The registers we need saved to the frame.  */
3393   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3394     if (aarch64_register_saved_on_entry (regno))
3395       {
3396         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3397         if (!frame_pointer_needed)
3398           offset += cfun->machine->frame.frame_size
3399                     - cfun->machine->frame.hard_fp_offset;
3400         /* Check that we can access the stack slot of the register with one
3401            direct load with no adjustments needed.  */
3402         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3403           bitmap_set_bit (components, regno);
3404       }
3405
3406   /* Don't mess with the hard frame pointer.  */
3407   if (frame_pointer_needed)
3408     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3409
3410   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3411   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3412   /* If aarch64_layout_frame has chosen registers to store/restore with
3413      writeback don't interfere with them to avoid having to output explicit
3414      stack adjustment instructions.  */
3415   if (reg2 != INVALID_REGNUM)
3416     bitmap_clear_bit (components, reg2);
3417   if (reg1 != INVALID_REGNUM)
3418     bitmap_clear_bit (components, reg1);
3419
3420   bitmap_clear_bit (components, LR_REGNUM);
3421   bitmap_clear_bit (components, SP_REGNUM);
3422
3423   return components;
3424 }
3425
3426 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3427
3428 static sbitmap
3429 aarch64_components_for_bb (basic_block bb)
3430 {
3431   bitmap in = DF_LIVE_IN (bb);
3432   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3433   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3434
3435   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3436   bitmap_clear (components);
3437
3438   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3439   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3440     if ((!call_used_regs[regno])
3441        && (bitmap_bit_p (in, regno)
3442            || bitmap_bit_p (gen, regno)
3443            || bitmap_bit_p (kill, regno)))
3444           bitmap_set_bit (components, regno);
3445
3446   return components;
3447 }
3448
3449 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3450    Nothing to do for aarch64.  */
3451
3452 static void
3453 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3454 {
3455 }
3456
3457 /* Return the next set bit in BMP from START onwards.  Return the total number
3458    of bits in BMP if no set bit is found at or after START.  */
3459
3460 static unsigned int
3461 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3462 {
3463   unsigned int nbits = SBITMAP_SIZE (bmp);
3464   if (start == nbits)
3465     return start;
3466
3467   gcc_assert (start < nbits);
3468   for (unsigned int i = start; i < nbits; i++)
3469     if (bitmap_bit_p (bmp, i))
3470       return i;
3471
3472   return nbits;
3473 }
3474
3475 /* Do the work for aarch64_emit_prologue_components and
3476    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3477    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3478    for these components or the epilogue sequence.  That is, it determines
3479    whether we should emit stores or loads and what kind of CFA notes to attach
3480    to the insns.  Otherwise the logic for the two sequences is very
3481    similar.  */
3482
3483 static void
3484 aarch64_process_components (sbitmap components, bool prologue_p)
3485 {
3486   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3487                              ? HARD_FRAME_POINTER_REGNUM
3488                              : STACK_POINTER_REGNUM);
3489
3490   unsigned last_regno = SBITMAP_SIZE (components);
3491   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3492   rtx_insn *insn = NULL;
3493
3494   while (regno != last_regno)
3495     {
3496       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3497          so DFmode for the vector registers is enough.  */
3498       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3499       rtx reg = gen_rtx_REG (mode, regno);
3500       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3501       if (!frame_pointer_needed)
3502         offset += cfun->machine->frame.frame_size
3503                   - cfun->machine->frame.hard_fp_offset;
3504       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3505       rtx mem = gen_frame_mem (mode, addr);
3506
3507       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3508       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3509       /* No more registers to handle after REGNO.
3510          Emit a single save/restore and exit.  */
3511       if (regno2 == last_regno)
3512         {
3513           insn = emit_insn (set);
3514           RTX_FRAME_RELATED_P (insn) = 1;
3515           if (prologue_p)
3516             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3517           else
3518             add_reg_note (insn, REG_CFA_RESTORE, reg);
3519           break;
3520         }
3521
3522       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3523       /* The next register is not of the same class or its offset is not
3524          mergeable with the current one into a pair.  */
3525       if (!satisfies_constraint_Ump (mem)
3526           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3527           || (offset2 - cfun->machine->frame.reg_offset[regno])
3528                 != GET_MODE_SIZE (mode))
3529         {
3530           insn = emit_insn (set);
3531           RTX_FRAME_RELATED_P (insn) = 1;
3532           if (prologue_p)
3533             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3534           else
3535             add_reg_note (insn, REG_CFA_RESTORE, reg);
3536
3537           regno = regno2;
3538           continue;
3539         }
3540
3541       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3542       rtx reg2 = gen_rtx_REG (mode, regno2);
3543       if (!frame_pointer_needed)
3544         offset2 += cfun->machine->frame.frame_size
3545                   - cfun->machine->frame.hard_fp_offset;
3546       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3547       rtx mem2 = gen_frame_mem (mode, addr2);
3548       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3549                              : gen_rtx_SET (reg2, mem2);
3550
3551       if (prologue_p)
3552         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3553       else
3554         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3555
3556       RTX_FRAME_RELATED_P (insn) = 1;
3557       if (prologue_p)
3558         {
3559           add_reg_note (insn, REG_CFA_OFFSET, set);
3560           add_reg_note (insn, REG_CFA_OFFSET, set2);
3561         }
3562       else
3563         {
3564           add_reg_note (insn, REG_CFA_RESTORE, reg);
3565           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3566         }
3567
3568       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3569     }
3570 }
3571
3572 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3573
3574 static void
3575 aarch64_emit_prologue_components (sbitmap components)
3576 {
3577   aarch64_process_components (components, true);
3578 }
3579
3580 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3581
3582 static void
3583 aarch64_emit_epilogue_components (sbitmap components)
3584 {
3585   aarch64_process_components (components, false);
3586 }
3587
3588 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3589
3590 static void
3591 aarch64_set_handled_components (sbitmap components)
3592 {
3593   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3594     if (bitmap_bit_p (components, regno))
3595       cfun->machine->reg_is_wrapped_separately[regno] = true;
3596 }
3597
3598 /* AArch64 stack frames generated by this compiler look like:
3599
3600         +-------------------------------+
3601         |                               |
3602         |  incoming stack arguments     |
3603         |                               |
3604         +-------------------------------+
3605         |                               | <-- incoming stack pointer (aligned)
3606         |  callee-allocated save area   |
3607         |  for register varargs         |
3608         |                               |
3609         +-------------------------------+
3610         |  local variables              | <-- frame_pointer_rtx
3611         |                               |
3612         +-------------------------------+
3613         |  padding0                     | \
3614         +-------------------------------+  |
3615         |  callee-saved registers       |  | frame.saved_regs_size
3616         +-------------------------------+  |
3617         |  LR'                          |  |
3618         +-------------------------------+  |
3619         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3620         +-------------------------------+
3621         |  dynamic allocation           |
3622         +-------------------------------+
3623         |  padding                      |
3624         +-------------------------------+
3625         |  outgoing stack arguments     | <-- arg_pointer
3626         |                               |
3627         +-------------------------------+
3628         |                               | <-- stack_pointer_rtx (aligned)
3629
3630    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3631    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3632    unchanged.  */
3633
3634 /* Generate the prologue instructions for entry into a function.
3635    Establish the stack frame by decreasing the stack pointer with a
3636    properly calculated size and, if necessary, create a frame record
3637    filled with the values of LR and previous frame pointer.  The
3638    current FP is also set up if it is in use.  */
3639
3640 void
3641 aarch64_expand_prologue (void)
3642 {
3643   aarch64_layout_frame ();
3644
3645   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3646   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3647   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3648   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3649   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3650   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3651   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3652   rtx_insn *insn;
3653
3654   /* Sign return address for functions.  */
3655   if (aarch64_return_address_signing_enabled ())
3656     {
3657       insn = emit_insn (gen_pacisp ());
3658       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3659       RTX_FRAME_RELATED_P (insn) = 1;
3660     }
3661
3662   if (flag_stack_usage_info)
3663     current_function_static_stack_size = frame_size;
3664
3665   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3666     {
3667       if (crtl->is_leaf && !cfun->calls_alloca)
3668         {
3669           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3670             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3671                                             frame_size - STACK_CHECK_PROTECT);
3672         }
3673       else if (frame_size > 0)
3674         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3675     }
3676
3677   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3678
3679   if (callee_adjust != 0)
3680     aarch64_push_regs (reg1, reg2, callee_adjust);
3681
3682   if (frame_pointer_needed)
3683     {
3684       if (callee_adjust == 0)
3685         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3686                                    R30_REGNUM, false);
3687       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3688                                        stack_pointer_rtx,
3689                                        GEN_INT (callee_offset)));
3690       RTX_FRAME_RELATED_P (insn) = 1;
3691       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3692     }
3693
3694   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3695                              callee_adjust != 0 || frame_pointer_needed);
3696   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3697                              callee_adjust != 0 || frame_pointer_needed);
3698   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3699 }
3700
3701 /* Return TRUE if we can use a simple_return insn.
3702
3703    This function checks whether the callee saved stack is empty, which
3704    means no restore actions are need. The pro_and_epilogue will use
3705    this to check whether shrink-wrapping opt is feasible.  */
3706
3707 bool
3708 aarch64_use_return_insn_p (void)
3709 {
3710   if (!reload_completed)
3711     return false;
3712
3713   if (crtl->profile)
3714     return false;
3715
3716   aarch64_layout_frame ();
3717
3718   return cfun->machine->frame.frame_size == 0;
3719 }
3720
3721 /* Generate the epilogue instructions for returning from a function.
3722    This is almost exactly the reverse of the prolog sequence, except
3723    that we need to insert barriers to avoid scheduling loads that read
3724    from a deallocated stack, and we optimize the unwind records by
3725    emitting them all together if possible.  */
3726 void
3727 aarch64_expand_epilogue (bool for_sibcall)
3728 {
3729   aarch64_layout_frame ();
3730
3731   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3732   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3733   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3734   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3735   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3736   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3737   rtx cfi_ops = NULL;
3738   rtx_insn *insn;
3739
3740   /* We need to add memory barrier to prevent read from deallocated stack.  */
3741   bool need_barrier_p = (get_frame_size ()
3742                          + cfun->machine->frame.saved_varargs_size) != 0;
3743
3744   /* Emit a barrier to prevent loads from a deallocated stack.  */
3745   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3746       || crtl->calls_eh_return)
3747     {
3748       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3749       need_barrier_p = false;
3750     }
3751
3752   /* Restore the stack pointer from the frame pointer if it may not
3753      be the same as the stack pointer.  */
3754   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3755     {
3756       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3757                                        hard_frame_pointer_rtx,
3758                                        GEN_INT (-callee_offset)));
3759       /* If writeback is used when restoring callee-saves, the CFA
3760          is restored on the instruction doing the writeback.  */
3761       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3762     }
3763   else
3764     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3765
3766   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3767                                 callee_adjust != 0, &cfi_ops);
3768   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3769                                 callee_adjust != 0, &cfi_ops);
3770
3771   if (need_barrier_p)
3772     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3773
3774   if (callee_adjust != 0)
3775     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3776
3777   if (callee_adjust != 0 || initial_adjust > 65536)
3778     {
3779       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3780       insn = get_last_insn ();
3781       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3782       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3783       RTX_FRAME_RELATED_P (insn) = 1;
3784       cfi_ops = NULL;
3785     }
3786
3787   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3788
3789   if (cfi_ops)
3790     {
3791       /* Emit delayed restores and reset the CFA to be SP.  */
3792       insn = get_last_insn ();
3793       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3794       REG_NOTES (insn) = cfi_ops;
3795       RTX_FRAME_RELATED_P (insn) = 1;
3796     }
3797
3798   /* We prefer to emit the combined return/authenticate instruction RETAA,
3799      however there are three cases in which we must instead emit an explicit
3800      authentication instruction.
3801
3802         1) Sibcalls don't return in a normal way, so if we're about to call one
3803            we must authenticate.
3804
3805         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3806            generating code for !TARGET_ARMV8_3 we can't use it and must
3807            explicitly authenticate.
3808
3809         3) On an eh_return path we make extra stack adjustments to update the
3810            canonical frame address to be the exception handler's CFA.  We want
3811            to authenticate using the CFA of the function which calls eh_return.
3812     */
3813   if (aarch64_return_address_signing_enabled ()
3814       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3815     {
3816       insn = emit_insn (gen_autisp ());
3817       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3818       RTX_FRAME_RELATED_P (insn) = 1;
3819     }
3820
3821   /* Stack adjustment for exception handler.  */
3822   if (crtl->calls_eh_return)
3823     {
3824       /* We need to unwind the stack by the offset computed by
3825          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3826          to be SP; letting the CFA move during this adjustment
3827          is just as correct as retaining the CFA from the body
3828          of the function.  Therefore, do nothing special.  */
3829       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3830     }
3831
3832   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3833   if (!for_sibcall)
3834     emit_jump_insn (ret_rtx);
3835 }
3836
3837 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3838    normally or return to a previous frame after unwinding.
3839
3840    An EH return uses a single shared return sequence.  The epilogue is
3841    exactly like a normal epilogue except that it has an extra input
3842    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3843    that must be applied after the frame has been destroyed.  An extra label
3844    is inserted before the epilogue which initializes this register to zero,
3845    and this is the entry point for a normal return.
3846
3847    An actual EH return updates the return address, initializes the stack
3848    adjustment and jumps directly into the epilogue (bypassing the zeroing
3849    of the adjustment).  Since the return address is typically saved on the
3850    stack when a function makes a call, the saved LR must be updated outside
3851    the epilogue.
3852
3853    This poses problems as the store is generated well before the epilogue,
3854    so the offset of LR is not known yet.  Also optimizations will remove the
3855    store as it appears dead, even after the epilogue is generated (as the
3856    base or offset for loading LR is different in many cases).
3857
3858    To avoid these problems this implementation forces the frame pointer
3859    in eh_return functions so that the location of LR is fixed and known early.
3860    It also marks the store volatile, so no optimization is permitted to
3861    remove the store.  */
3862 rtx
3863 aarch64_eh_return_handler_rtx (void)
3864 {
3865   rtx tmp = gen_frame_mem (Pmode,
3866     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3867
3868   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3869   MEM_VOLATILE_P (tmp) = true;
3870   return tmp;
3871 }
3872
3873 /* Output code to add DELTA to the first argument, and then jump
3874    to FUNCTION.  Used for C++ multiple inheritance.  */
3875 static void
3876 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3877                          HOST_WIDE_INT delta,
3878                          HOST_WIDE_INT vcall_offset,
3879                          tree function)
3880 {
3881   /* The this pointer is always in x0.  Note that this differs from
3882      Arm where the this pointer maybe bumped to r1 if r0 is required
3883      to return a pointer to an aggregate.  On AArch64 a result value
3884      pointer will be in x8.  */
3885   int this_regno = R0_REGNUM;
3886   rtx this_rtx, temp0, temp1, addr, funexp;
3887   rtx_insn *insn;
3888
3889   reload_completed = 1;
3890   emit_note (NOTE_INSN_PROLOGUE_END);
3891
3892   if (vcall_offset == 0)
3893     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3894   else
3895     {
3896       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3897
3898       this_rtx = gen_rtx_REG (Pmode, this_regno);
3899       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3900       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3901
3902       addr = this_rtx;
3903       if (delta != 0)
3904         {
3905           if (delta >= -256 && delta < 256)
3906             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3907                                        plus_constant (Pmode, this_rtx, delta));
3908           else
3909             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3910         }
3911
3912       if (Pmode == ptr_mode)
3913         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3914       else
3915         aarch64_emit_move (temp0,
3916                            gen_rtx_ZERO_EXTEND (Pmode,
3917                                                 gen_rtx_MEM (ptr_mode, addr)));
3918
3919       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3920           addr = plus_constant (Pmode, temp0, vcall_offset);
3921       else
3922         {
3923           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3924                                           Pmode);
3925           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3926         }
3927
3928       if (Pmode == ptr_mode)
3929         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3930       else
3931         aarch64_emit_move (temp1,
3932                            gen_rtx_SIGN_EXTEND (Pmode,
3933                                                 gen_rtx_MEM (ptr_mode, addr)));
3934
3935       emit_insn (gen_add2_insn (this_rtx, temp1));
3936     }
3937
3938   /* Generate a tail call to the target function.  */
3939   if (!TREE_USED (function))
3940     {
3941       assemble_external (function);
3942       TREE_USED (function) = 1;
3943     }
3944   funexp = XEXP (DECL_RTL (function), 0);
3945   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3946   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3947   SIBLING_CALL_P (insn) = 1;
3948
3949   insn = get_insns ();
3950   shorten_branches (insn);
3951   final_start_function (insn, file, 1);
3952   final (insn, file, 1);
3953   final_end_function ();
3954
3955   /* Stop pretending to be a post-reload pass.  */
3956   reload_completed = 0;
3957 }
3958
3959 static bool
3960 aarch64_tls_referenced_p (rtx x)
3961 {
3962   if (!TARGET_HAVE_TLS)
3963     return false;
3964   subrtx_iterator::array_type array;
3965   FOR_EACH_SUBRTX (iter, array, x, ALL)
3966     {
3967       const_rtx x = *iter;
3968       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3969         return true;
3970       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3971          TLS offsets, not real symbol references.  */
3972       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3973         iter.skip_subrtxes ();
3974     }
3975   return false;
3976 }
3977
3978
3979 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3980    a left shift of 0 or 12 bits.  */
3981 bool
3982 aarch64_uimm12_shift (HOST_WIDE_INT val)
3983 {
3984   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3985           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3986           );
3987 }
3988
3989
3990 /* Return true if val is an immediate that can be loaded into a
3991    register by a MOVZ instruction.  */
3992 static bool
3993 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
3994 {
3995   if (GET_MODE_SIZE (mode) > 4)
3996     {
3997       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3998           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3999         return 1;
4000     }
4001   else
4002     {
4003       /* Ignore sign extension.  */
4004       val &= (HOST_WIDE_INT) 0xffffffff;
4005     }
4006   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4007           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4008 }
4009
4010 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4011
4012 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4013   {
4014     0x0000000100000001ull,
4015     0x0001000100010001ull,
4016     0x0101010101010101ull,
4017     0x1111111111111111ull,
4018     0x5555555555555555ull,
4019   };
4020
4021
4022 /* Return true if val is a valid bitmask immediate.  */
4023
4024 bool
4025 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4026 {
4027   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4028   int bits;
4029
4030   /* Check for a single sequence of one bits and return quickly if so.
4031      The special cases of all ones and all zeroes returns false.  */
4032   val = (unsigned HOST_WIDE_INT) val_in;
4033   tmp = val + (val & -val);
4034
4035   if (tmp == (tmp & -tmp))
4036     return (val + 1) > 1;
4037
4038   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4039   if (mode == SImode)
4040     val = (val << 32) | (val & 0xffffffff);
4041
4042   /* Invert if the immediate doesn't start with a zero bit - this means we
4043      only need to search for sequences of one bits.  */
4044   if (val & 1)
4045     val = ~val;
4046
4047   /* Find the first set bit and set tmp to val with the first sequence of one
4048      bits removed.  Return success if there is a single sequence of ones.  */
4049   first_one = val & -val;
4050   tmp = val & (val + first_one);
4051
4052   if (tmp == 0)
4053     return true;
4054
4055   /* Find the next set bit and compute the difference in bit position.  */
4056   next_one = tmp & -tmp;
4057   bits = clz_hwi (first_one) - clz_hwi (next_one);
4058   mask = val ^ tmp;
4059
4060   /* Check the bit position difference is a power of 2, and that the first
4061      sequence of one bits fits within 'bits' bits.  */
4062   if ((mask >> bits) != 0 || bits != (bits & -bits))
4063     return false;
4064
4065   /* Check the sequence of one bits is repeated 64/bits times.  */
4066   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4067 }
4068
4069 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4070    Assumed precondition: VAL_IN Is not zero.  */
4071
4072 unsigned HOST_WIDE_INT
4073 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4074 {
4075   int lowest_bit_set = ctz_hwi (val_in);
4076   int highest_bit_set = floor_log2 (val_in);
4077   gcc_assert (val_in != 0);
4078
4079   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4080           (HOST_WIDE_INT_1U << lowest_bit_set));
4081 }
4082
4083 /* Create constant where bits outside of lowest bit set to highest bit set
4084    are set to 1.  */
4085
4086 unsigned HOST_WIDE_INT
4087 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4088 {
4089   return val_in | ~aarch64_and_split_imm1 (val_in);
4090 }
4091
4092 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4093
4094 bool
4095 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4096 {
4097   scalar_int_mode int_mode;
4098   if (!is_a <scalar_int_mode> (mode, &int_mode))
4099     return false;
4100
4101   if (aarch64_bitmask_imm (val_in, int_mode))
4102     return false;
4103
4104   if (aarch64_move_imm (val_in, int_mode))
4105     return false;
4106
4107   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4108
4109   return aarch64_bitmask_imm (imm2, int_mode);
4110 }
4111
4112 /* Return true if val is an immediate that can be loaded into a
4113    register in a single instruction.  */
4114 bool
4115 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4116 {
4117   scalar_int_mode int_mode;
4118   if (!is_a <scalar_int_mode> (mode, &int_mode))
4119     return false;
4120
4121   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4122     return 1;
4123   return aarch64_bitmask_imm (val, int_mode);
4124 }
4125
4126 static bool
4127 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4128 {
4129   rtx base, offset;
4130
4131   if (GET_CODE (x) == HIGH)
4132     return true;
4133
4134   split_const (x, &base, &offset);
4135   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4136     {
4137       if (aarch64_classify_symbol (base, offset)
4138           != SYMBOL_FORCE_TO_MEM)
4139         return true;
4140       else
4141         /* Avoid generating a 64-bit relocation in ILP32; leave
4142            to aarch64_expand_mov_immediate to handle it properly.  */
4143         return mode != ptr_mode;
4144     }
4145
4146   return aarch64_tls_referenced_p (x);
4147 }
4148
4149 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4150    The expansion for a table switch is quite expensive due to the number
4151    of instructions, the table lookup and hard to predict indirect jump.
4152    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4153    set, otherwise use tables for > 16 cases as a tradeoff between size and
4154    performance.  When optimizing for size, use the default setting.  */
4155
4156 static unsigned int
4157 aarch64_case_values_threshold (void)
4158 {
4159   /* Use the specified limit for the number of cases before using jump
4160      tables at higher optimization levels.  */
4161   if (optimize > 2
4162       && selected_cpu->tune->max_case_values != 0)
4163     return selected_cpu->tune->max_case_values;
4164   else
4165     return optimize_size ? default_case_values_threshold () : 17;
4166 }
4167
4168 /* Return true if register REGNO is a valid index register.
4169    STRICT_P is true if REG_OK_STRICT is in effect.  */
4170
4171 bool
4172 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4173 {
4174   if (!HARD_REGISTER_NUM_P (regno))
4175     {
4176       if (!strict_p)
4177         return true;
4178
4179       if (!reg_renumber)
4180         return false;
4181
4182       regno = reg_renumber[regno];
4183     }
4184   return GP_REGNUM_P (regno);
4185 }
4186
4187 /* Return true if register REGNO is a valid base register for mode MODE.
4188    STRICT_P is true if REG_OK_STRICT is in effect.  */
4189
4190 bool
4191 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4192 {
4193   if (!HARD_REGISTER_NUM_P (regno))
4194     {
4195       if (!strict_p)
4196         return true;
4197
4198       if (!reg_renumber)
4199         return false;
4200
4201       regno = reg_renumber[regno];
4202     }
4203
4204   /* The fake registers will be eliminated to either the stack or
4205      hard frame pointer, both of which are usually valid base registers.
4206      Reload deals with the cases where the eliminated form isn't valid.  */
4207   return (GP_REGNUM_P (regno)
4208           || regno == SP_REGNUM
4209           || regno == FRAME_POINTER_REGNUM
4210           || regno == ARG_POINTER_REGNUM);
4211 }
4212
4213 /* Return true if X is a valid base register for mode MODE.
4214    STRICT_P is true if REG_OK_STRICT is in effect.  */
4215
4216 static bool
4217 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4218 {
4219   if (!strict_p
4220       && GET_CODE (x) == SUBREG
4221       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4222     x = SUBREG_REG (x);
4223
4224   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4225 }
4226
4227 /* Return true if address offset is a valid index.  If it is, fill in INFO
4228    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4229
4230 static bool
4231 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4232                         machine_mode mode, bool strict_p)
4233 {
4234   enum aarch64_address_type type;
4235   rtx index;
4236   int shift;
4237
4238   /* (reg:P) */
4239   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4240       && GET_MODE (x) == Pmode)
4241     {
4242       type = ADDRESS_REG_REG;
4243       index = x;
4244       shift = 0;
4245     }
4246   /* (sign_extend:DI (reg:SI)) */
4247   else if ((GET_CODE (x) == SIGN_EXTEND
4248             || GET_CODE (x) == ZERO_EXTEND)
4249            && GET_MODE (x) == DImode
4250            && GET_MODE (XEXP (x, 0)) == SImode)
4251     {
4252       type = (GET_CODE (x) == SIGN_EXTEND)
4253         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4254       index = XEXP (x, 0);
4255       shift = 0;
4256     }
4257   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4258   else if (GET_CODE (x) == MULT
4259            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4260                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4261            && GET_MODE (XEXP (x, 0)) == DImode
4262            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4263            && CONST_INT_P (XEXP (x, 1)))
4264     {
4265       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4266         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267       index = XEXP (XEXP (x, 0), 0);
4268       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4269     }
4270   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4271   else if (GET_CODE (x) == ASHIFT
4272            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274            && GET_MODE (XEXP (x, 0)) == DImode
4275            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276            && CONST_INT_P (XEXP (x, 1)))
4277     {
4278       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280       index = XEXP (XEXP (x, 0), 0);
4281       shift = INTVAL (XEXP (x, 1));
4282     }
4283   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4284   else if ((GET_CODE (x) == SIGN_EXTRACT
4285             || GET_CODE (x) == ZERO_EXTRACT)
4286            && GET_MODE (x) == DImode
4287            && GET_CODE (XEXP (x, 0)) == MULT
4288            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4289            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4290     {
4291       type = (GET_CODE (x) == SIGN_EXTRACT)
4292         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293       index = XEXP (XEXP (x, 0), 0);
4294       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4295       if (INTVAL (XEXP (x, 1)) != 32 + shift
4296           || INTVAL (XEXP (x, 2)) != 0)
4297         shift = -1;
4298     }
4299   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4300      (const_int 0xffffffff<<shift)) */
4301   else if (GET_CODE (x) == AND
4302            && GET_MODE (x) == DImode
4303            && GET_CODE (XEXP (x, 0)) == MULT
4304            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4305            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4306            && CONST_INT_P (XEXP (x, 1)))
4307     {
4308       type = ADDRESS_REG_UXTW;
4309       index = XEXP (XEXP (x, 0), 0);
4310       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4311       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4312         shift = -1;
4313     }
4314   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4315   else if ((GET_CODE (x) == SIGN_EXTRACT
4316             || GET_CODE (x) == ZERO_EXTRACT)
4317            && GET_MODE (x) == DImode
4318            && GET_CODE (XEXP (x, 0)) == ASHIFT
4319            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4320            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4321     {
4322       type = (GET_CODE (x) == SIGN_EXTRACT)
4323         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4324       index = XEXP (XEXP (x, 0), 0);
4325       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4326       if (INTVAL (XEXP (x, 1)) != 32 + shift
4327           || INTVAL (XEXP (x, 2)) != 0)
4328         shift = -1;
4329     }
4330   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4331      (const_int 0xffffffff<<shift)) */
4332   else if (GET_CODE (x) == AND
4333            && GET_MODE (x) == DImode
4334            && GET_CODE (XEXP (x, 0)) == ASHIFT
4335            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4336            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4337            && CONST_INT_P (XEXP (x, 1)))
4338     {
4339       type = ADDRESS_REG_UXTW;
4340       index = XEXP (XEXP (x, 0), 0);
4341       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4342       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4343         shift = -1;
4344     }
4345   /* (mult:P (reg:P) (const_int scale)) */
4346   else if (GET_CODE (x) == MULT
4347            && GET_MODE (x) == Pmode
4348            && GET_MODE (XEXP (x, 0)) == Pmode
4349            && CONST_INT_P (XEXP (x, 1)))
4350     {
4351       type = ADDRESS_REG_REG;
4352       index = XEXP (x, 0);
4353       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4354     }
4355   /* (ashift:P (reg:P) (const_int shift)) */
4356   else if (GET_CODE (x) == ASHIFT
4357            && GET_MODE (x) == Pmode
4358            && GET_MODE (XEXP (x, 0)) == Pmode
4359            && CONST_INT_P (XEXP (x, 1)))
4360     {
4361       type = ADDRESS_REG_REG;
4362       index = XEXP (x, 0);
4363       shift = INTVAL (XEXP (x, 1));
4364     }
4365   else
4366     return false;
4367
4368   if (!strict_p
4369       && GET_CODE (index) == SUBREG
4370       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4371     index = SUBREG_REG (index);
4372
4373   if ((shift == 0 ||
4374        (shift > 0 && shift <= 3
4375         && (1 << shift) == GET_MODE_SIZE (mode)))
4376       && REG_P (index)
4377       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4378     {
4379       info->type = type;
4380       info->offset = index;
4381       info->shift = shift;
4382       return true;
4383     }
4384
4385   return false;
4386 }
4387
4388 /* Return true if MODE is one of the modes for which we
4389    support LDP/STP operations.  */
4390
4391 static bool
4392 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4393 {
4394   return mode == SImode || mode == DImode
4395          || mode == SFmode || mode == DFmode
4396          || (aarch64_vector_mode_supported_p (mode)
4397              && GET_MODE_SIZE (mode) == 8);
4398 }
4399
4400 /* Return true if REGNO is a virtual pointer register, or an eliminable
4401    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4402    include stack_pointer or hard_frame_pointer.  */
4403 static bool
4404 virt_or_elim_regno_p (unsigned regno)
4405 {
4406   return ((regno >= FIRST_VIRTUAL_REGISTER
4407            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4408           || regno == FRAME_POINTER_REGNUM
4409           || regno == ARG_POINTER_REGNUM);
4410 }
4411
4412 /* Return true if X is a valid address for machine mode MODE.  If it is,
4413    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4414    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4415
4416 static bool
4417 aarch64_classify_address (struct aarch64_address_info *info,
4418                           rtx x, machine_mode mode,
4419                           RTX_CODE outer_code, bool strict_p)
4420 {
4421   enum rtx_code code = GET_CODE (x);
4422   rtx op0, op1;
4423
4424   /* On BE, we use load/store pair for all large int mode load/stores.
4425      TI/TFmode may also use a load/store pair.  */
4426   bool load_store_pair_p = (outer_code == PARALLEL
4427                             || mode == TImode
4428                             || mode == TFmode
4429                             || (BYTES_BIG_ENDIAN
4430                                 && aarch64_vect_struct_mode_p (mode)));
4431
4432   bool allow_reg_index_p =
4433     !load_store_pair_p
4434     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4435     && !aarch64_vect_struct_mode_p (mode);
4436
4437   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4438      REG addressing.  */
4439   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4440       && (code != POST_INC && code != REG))
4441     return false;
4442
4443   switch (code)
4444     {
4445     case REG:
4446     case SUBREG:
4447       info->type = ADDRESS_REG_IMM;
4448       info->base = x;
4449       info->offset = const0_rtx;
4450       return aarch64_base_register_rtx_p (x, strict_p);
4451
4452     case PLUS:
4453       op0 = XEXP (x, 0);
4454       op1 = XEXP (x, 1);
4455
4456       if (! strict_p
4457           && REG_P (op0)
4458           && virt_or_elim_regno_p (REGNO (op0))
4459           && CONST_INT_P (op1))
4460         {
4461           info->type = ADDRESS_REG_IMM;
4462           info->base = op0;
4463           info->offset = op1;
4464
4465           return true;
4466         }
4467
4468       if (GET_MODE_SIZE (mode) != 0
4469           && CONST_INT_P (op1)
4470           && aarch64_base_register_rtx_p (op0, strict_p))
4471         {
4472           HOST_WIDE_INT offset = INTVAL (op1);
4473
4474           info->type = ADDRESS_REG_IMM;
4475           info->base = op0;
4476           info->offset = op1;
4477
4478           /* TImode and TFmode values are allowed in both pairs of X
4479              registers and individual Q registers.  The available
4480              address modes are:
4481              X,X: 7-bit signed scaled offset
4482              Q:   9-bit signed offset
4483              We conservatively require an offset representable in either mode.
4484              When performing the check for pairs of X registers i.e.  LDP/STP
4485              pass down DImode since that is the natural size of the LDP/STP
4486              instruction memory accesses.  */
4487           if (mode == TImode || mode == TFmode)
4488             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4489                     && (offset_9bit_signed_unscaled_p (mode, offset)
4490                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4491
4492           /* A 7bit offset check because OImode will emit a ldp/stp
4493              instruction (only big endian will get here).
4494              For ldp/stp instructions, the offset is scaled for the size of a
4495              single element of the pair.  */
4496           if (mode == OImode)
4497             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4498
4499           /* Three 9/12 bit offsets checks because CImode will emit three
4500              ldr/str instructions (only big endian will get here).  */
4501           if (mode == CImode)
4502             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4503                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4504                         || offset_12bit_unsigned_scaled_p (V16QImode,
4505                                                            offset + 32)));
4506
4507           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4508              instructions (only big endian will get here).  */
4509           if (mode == XImode)
4510             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4511                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4512                                                             offset + 32));
4513
4514           if (load_store_pair_p)
4515             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4516                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4517           else
4518             return (offset_9bit_signed_unscaled_p (mode, offset)
4519                     || offset_12bit_unsigned_scaled_p (mode, offset));
4520         }
4521
4522       if (allow_reg_index_p)
4523         {
4524           /* Look for base + (scaled/extended) index register.  */
4525           if (aarch64_base_register_rtx_p (op0, strict_p)
4526               && aarch64_classify_index (info, op1, mode, strict_p))
4527             {
4528               info->base = op0;
4529               return true;
4530             }
4531           if (aarch64_base_register_rtx_p (op1, strict_p)
4532               && aarch64_classify_index (info, op0, mode, strict_p))
4533             {
4534               info->base = op1;
4535               return true;
4536             }
4537         }
4538
4539       return false;
4540
4541     case POST_INC:
4542     case POST_DEC:
4543     case PRE_INC:
4544     case PRE_DEC:
4545       info->type = ADDRESS_REG_WB;
4546       info->base = XEXP (x, 0);
4547       info->offset = NULL_RTX;
4548       return aarch64_base_register_rtx_p (info->base, strict_p);
4549
4550     case POST_MODIFY:
4551     case PRE_MODIFY:
4552       info->type = ADDRESS_REG_WB;
4553       info->base = XEXP (x, 0);
4554       if (GET_CODE (XEXP (x, 1)) == PLUS
4555           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4556           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4557           && aarch64_base_register_rtx_p (info->base, strict_p))
4558         {
4559           HOST_WIDE_INT offset;
4560           info->offset = XEXP (XEXP (x, 1), 1);
4561           offset = INTVAL (info->offset);
4562
4563           /* TImode and TFmode values are allowed in both pairs of X
4564              registers and individual Q registers.  The available
4565              address modes are:
4566              X,X: 7-bit signed scaled offset
4567              Q:   9-bit signed offset
4568              We conservatively require an offset representable in either mode.
4569            */
4570           if (mode == TImode || mode == TFmode)
4571             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4572                     && offset_9bit_signed_unscaled_p (mode, offset));
4573
4574           if (load_store_pair_p)
4575             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4576                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4577           else
4578             return offset_9bit_signed_unscaled_p (mode, offset);
4579         }
4580       return false;
4581
4582     case CONST:
4583     case SYMBOL_REF:
4584     case LABEL_REF:
4585       /* load literal: pc-relative constant pool entry.  Only supported
4586          for SI mode or larger.  */
4587       info->type = ADDRESS_SYMBOLIC;
4588
4589       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4590         {
4591           rtx sym, addend;
4592
4593           split_const (x, &sym, &addend);
4594           return ((GET_CODE (sym) == LABEL_REF
4595                    || (GET_CODE (sym) == SYMBOL_REF
4596                        && CONSTANT_POOL_ADDRESS_P (sym)
4597                        && aarch64_pcrelative_literal_loads)));
4598         }
4599       return false;
4600
4601     case LO_SUM:
4602       info->type = ADDRESS_LO_SUM;
4603       info->base = XEXP (x, 0);
4604       info->offset = XEXP (x, 1);
4605       if (allow_reg_index_p
4606           && aarch64_base_register_rtx_p (info->base, strict_p))
4607         {
4608           rtx sym, offs;
4609           split_const (info->offset, &sym, &offs);
4610           if (GET_CODE (sym) == SYMBOL_REF
4611               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4612             {
4613               /* The symbol and offset must be aligned to the access size.  */
4614               unsigned int align;
4615               unsigned int ref_size;
4616
4617               if (CONSTANT_POOL_ADDRESS_P (sym))
4618                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4619               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4620                 {
4621                   tree exp = SYMBOL_REF_DECL (sym);
4622                   align = TYPE_ALIGN (TREE_TYPE (exp));
4623                   align = CONSTANT_ALIGNMENT (exp, align);
4624                 }
4625               else if (SYMBOL_REF_DECL (sym))
4626                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4627               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4628                        && SYMBOL_REF_BLOCK (sym) != NULL)
4629                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4630               else
4631                 align = BITS_PER_UNIT;
4632
4633               ref_size = GET_MODE_SIZE (mode);
4634               if (ref_size == 0)
4635                 ref_size = GET_MODE_SIZE (DImode);
4636
4637               return ((INTVAL (offs) & (ref_size - 1)) == 0
4638                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4639             }
4640         }
4641       return false;
4642
4643     default:
4644       return false;
4645     }
4646 }
4647
4648 /* Return true if the address X is valid for a PRFM instruction.
4649    STRICT_P is true if we should do strict checking with
4650    aarch64_classify_address.  */
4651
4652 bool
4653 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4654 {
4655   struct aarch64_address_info addr;
4656
4657   /* PRFM accepts the same addresses as DImode...  */
4658   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4659   if (!res)
4660     return false;
4661
4662   /* ... except writeback forms.  */
4663   return addr.type != ADDRESS_REG_WB;
4664 }
4665
4666 bool
4667 aarch64_symbolic_address_p (rtx x)
4668 {
4669   rtx offset;
4670
4671   split_const (x, &x, &offset);
4672   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4673 }
4674
4675 /* Classify the base of symbolic expression X.  */
4676
4677 enum aarch64_symbol_type
4678 aarch64_classify_symbolic_expression (rtx x)
4679 {
4680   rtx offset;
4681
4682   split_const (x, &x, &offset);
4683   return aarch64_classify_symbol (x, offset);
4684 }
4685
4686
4687 /* Return TRUE if X is a legitimate address for accessing memory in
4688    mode MODE.  */
4689 static bool
4690 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4691 {
4692   struct aarch64_address_info addr;
4693
4694   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4695 }
4696
4697 /* Return TRUE if X is a legitimate address for accessing memory in
4698    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4699    pair operation.  */
4700 bool
4701 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4702                               RTX_CODE outer_code, bool strict_p)
4703 {
4704   struct aarch64_address_info addr;
4705
4706   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4707 }
4708
4709 /* Split an out-of-range address displacement into a base and offset.
4710    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4711    to increase opportunities for sharing the base address of different sizes.
4712    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4713 static bool
4714 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4715 {
4716   HOST_WIDE_INT offset = INTVAL (*disp);
4717   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4718
4719   if (mode == TImode || mode == TFmode
4720       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4721     base = (offset + 0x100) & ~0x1ff;
4722
4723   *off = GEN_INT (base);
4724   *disp = GEN_INT (offset - base);
4725   return true;
4726 }
4727
4728 /* Return the binary representation of floating point constant VALUE in INTVAL.
4729    If the value cannot be converted, return false without setting INTVAL.
4730    The conversion is done in the given MODE.  */
4731 bool
4732 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4733 {
4734
4735   /* We make a general exception for 0.  */
4736   if (aarch64_float_const_zero_rtx_p (value))
4737     {
4738       *intval = 0;
4739       return true;
4740     }
4741
4742   machine_mode mode = GET_MODE (value);
4743   if (GET_CODE (value) != CONST_DOUBLE
4744       || !SCALAR_FLOAT_MODE_P (mode)
4745       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4746       /* Only support up to DF mode.  */
4747       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4748     return false;
4749
4750   unsigned HOST_WIDE_INT ival = 0;
4751
4752   long res[2];
4753   real_to_target (res,
4754                   CONST_DOUBLE_REAL_VALUE (value),
4755                   REAL_MODE_FORMAT (mode));
4756
4757   if (mode == DFmode)
4758     {
4759       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4760       ival = zext_hwi (res[order], 32);
4761       ival |= (zext_hwi (res[1 - order], 32) << 32);
4762     }
4763   else
4764       ival = zext_hwi (res[0], 32);
4765
4766   *intval = ival;
4767   return true;
4768 }
4769
4770 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4771    single MOV(+MOVK) followed by an FMOV.  */
4772 bool
4773 aarch64_float_const_rtx_p (rtx x)
4774 {
4775   machine_mode mode = GET_MODE (x);
4776   if (mode == VOIDmode)
4777     return false;
4778
4779   /* Determine whether it's cheaper to write float constants as
4780      mov/movk pairs over ldr/adrp pairs.  */
4781   unsigned HOST_WIDE_INT ival;
4782
4783   if (GET_CODE (x) == CONST_DOUBLE
4784       && SCALAR_FLOAT_MODE_P (mode)
4785       && aarch64_reinterpret_float_as_int (x, &ival))
4786     {
4787       scalar_int_mode imode = (mode == HFmode
4788                                ? SImode
4789                                : int_mode_for_mode (mode).require ());
4790       int num_instr = aarch64_internal_mov_immediate
4791                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4792       return num_instr < 3;
4793     }
4794
4795   return false;
4796 }
4797
4798 /* Return TRUE if rtx X is immediate constant 0.0 */
4799 bool
4800 aarch64_float_const_zero_rtx_p (rtx x)
4801 {
4802   if (GET_MODE (x) == VOIDmode)
4803     return false;
4804
4805   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4806     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4807   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4808 }
4809
4810 /* Return TRUE if rtx X is immediate constant that fits in a single
4811    MOVI immediate operation.  */
4812 bool
4813 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4814 {
4815   if (!TARGET_SIMD)
4816      return false;
4817
4818   machine_mode vmode;
4819   scalar_int_mode imode;
4820   unsigned HOST_WIDE_INT ival;
4821
4822   if (GET_CODE (x) == CONST_DOUBLE
4823       && SCALAR_FLOAT_MODE_P (mode))
4824     {
4825       if (!aarch64_reinterpret_float_as_int (x, &ival))
4826         return false;
4827
4828       /* We make a general exception for 0.  */
4829       if (aarch64_float_const_zero_rtx_p (x))
4830         return true;
4831
4832       imode = int_mode_for_mode (mode).require ();
4833     }
4834   else if (GET_CODE (x) == CONST_INT
4835            && is_a <scalar_int_mode> (mode, &imode))
4836     ival = INTVAL (x);
4837   else
4838     return false;
4839
4840    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4841      a 128 bit vector mode.  */
4842   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4843
4844   vmode = aarch64_simd_container_mode (imode, width);
4845   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4846
4847   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4848 }
4849
4850
4851 /* Return the fixed registers used for condition codes.  */
4852
4853 static bool
4854 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4855 {
4856   *p1 = CC_REGNUM;
4857   *p2 = INVALID_REGNUM;
4858   return true;
4859 }
4860
4861 /* This function is used by the call expanders of the machine description.
4862    RESULT is the register in which the result is returned.  It's NULL for
4863    "call" and "sibcall".
4864    MEM is the location of the function call.
4865    SIBCALL indicates whether this function call is normal call or sibling call.
4866    It will generate different pattern accordingly.  */
4867
4868 void
4869 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4870 {
4871   rtx call, callee, tmp;
4872   rtvec vec;
4873   machine_mode mode;
4874
4875   gcc_assert (MEM_P (mem));
4876   callee = XEXP (mem, 0);
4877   mode = GET_MODE (callee);
4878   gcc_assert (mode == Pmode);
4879
4880   /* Decide if we should generate indirect calls by loading the
4881      address of the callee into a register before performing
4882      the branch-and-link.  */
4883   if (SYMBOL_REF_P (callee)
4884       ? (aarch64_is_long_call_p (callee)
4885          || aarch64_is_noplt_call_p (callee))
4886       : !REG_P (callee))
4887     XEXP (mem, 0) = force_reg (mode, callee);
4888
4889   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4890
4891   if (result != NULL_RTX)
4892     call = gen_rtx_SET (result, call);
4893
4894   if (sibcall)
4895     tmp = ret_rtx;
4896   else
4897     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4898
4899   vec = gen_rtvec (2, call, tmp);
4900   call = gen_rtx_PARALLEL (VOIDmode, vec);
4901
4902   aarch64_emit_call_insn (call);
4903 }
4904
4905 /* Emit call insn with PAT and do aarch64-specific handling.  */
4906
4907 void
4908 aarch64_emit_call_insn (rtx pat)
4909 {
4910   rtx insn = emit_call_insn (pat);
4911
4912   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4913   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4914   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4915 }
4916
4917 machine_mode
4918 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4919 {
4920   /* All floating point compares return CCFP if it is an equality
4921      comparison, and CCFPE otherwise.  */
4922   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4923     {
4924       switch (code)
4925         {
4926         case EQ:
4927         case NE:
4928         case UNORDERED:
4929         case ORDERED:
4930         case UNLT:
4931         case UNLE:
4932         case UNGT:
4933         case UNGE:
4934         case UNEQ:
4935         case LTGT:
4936           return CCFPmode;
4937
4938         case LT:
4939         case LE:
4940         case GT:
4941         case GE:
4942           return CCFPEmode;
4943
4944         default:
4945           gcc_unreachable ();
4946         }
4947     }
4948
4949   /* Equality comparisons of short modes against zero can be performed
4950      using the TST instruction with the appropriate bitmask.  */
4951   if (y == const0_rtx && REG_P (x)
4952       && (code == EQ || code == NE)
4953       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4954     return CC_NZmode;
4955
4956   /* Similarly, comparisons of zero_extends from shorter modes can
4957      be performed using an ANDS with an immediate mask.  */
4958   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4959       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4960       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4961       && (code == EQ || code == NE))
4962     return CC_NZmode;
4963
4964   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4965       && y == const0_rtx
4966       && (code == EQ || code == NE || code == LT || code == GE)
4967       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4968           || GET_CODE (x) == NEG
4969           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4970               && CONST_INT_P (XEXP (x, 2)))))
4971     return CC_NZmode;
4972
4973   /* A compare with a shifted operand.  Because of canonicalization,
4974      the comparison will have to be swapped when we emit the assembly
4975      code.  */
4976   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4977       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4978       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4979           || GET_CODE (x) == LSHIFTRT
4980           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4981     return CC_SWPmode;
4982
4983   /* Similarly for a negated operand, but we can only do this for
4984      equalities.  */
4985   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4986       && (REG_P (y) || GET_CODE (y) == SUBREG)
4987       && (code == EQ || code == NE)
4988       && GET_CODE (x) == NEG)
4989     return CC_Zmode;
4990
4991   /* A test for unsigned overflow.  */
4992   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4993       && code == NE
4994       && GET_CODE (x) == PLUS
4995       && GET_CODE (y) == ZERO_EXTEND)
4996     return CC_Cmode;
4997
4998   /* For everything else, return CCmode.  */
4999   return CCmode;
5000 }
5001
5002 static int
5003 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5004
5005 int
5006 aarch64_get_condition_code (rtx x)
5007 {
5008   machine_mode mode = GET_MODE (XEXP (x, 0));
5009   enum rtx_code comp_code = GET_CODE (x);
5010
5011   if (GET_MODE_CLASS (mode) != MODE_CC)
5012     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5013   return aarch64_get_condition_code_1 (mode, comp_code);
5014 }
5015
5016 static int
5017 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5018 {
5019   switch (mode)
5020     {
5021     case E_CCFPmode:
5022     case E_CCFPEmode:
5023       switch (comp_code)
5024         {
5025         case GE: return AARCH64_GE;
5026         case GT: return AARCH64_GT;
5027         case LE: return AARCH64_LS;
5028         case LT: return AARCH64_MI;
5029         case NE: return AARCH64_NE;
5030         case EQ: return AARCH64_EQ;
5031         case ORDERED: return AARCH64_VC;
5032         case UNORDERED: return AARCH64_VS;
5033         case UNLT: return AARCH64_LT;
5034         case UNLE: return AARCH64_LE;
5035         case UNGT: return AARCH64_HI;
5036         case UNGE: return AARCH64_PL;
5037         default: return -1;
5038         }
5039       break;
5040
5041     case E_CCmode:
5042       switch (comp_code)
5043         {
5044         case NE: return AARCH64_NE;
5045         case EQ: return AARCH64_EQ;
5046         case GE: return AARCH64_GE;
5047         case GT: return AARCH64_GT;
5048         case LE: return AARCH64_LE;
5049         case LT: return AARCH64_LT;
5050         case GEU: return AARCH64_CS;
5051         case GTU: return AARCH64_HI;
5052         case LEU: return AARCH64_LS;
5053         case LTU: return AARCH64_CC;
5054         default: return -1;
5055         }
5056       break;
5057
5058     case E_CC_SWPmode:
5059       switch (comp_code)
5060         {
5061         case NE: return AARCH64_NE;
5062         case EQ: return AARCH64_EQ;
5063         case GE: return AARCH64_LE;
5064         case GT: return AARCH64_LT;
5065         case LE: return AARCH64_GE;
5066         case LT: return AARCH64_GT;
5067         case GEU: return AARCH64_LS;
5068         case GTU: return AARCH64_CC;
5069         case LEU: return AARCH64_CS;
5070         case LTU: return AARCH64_HI;
5071         default: return -1;
5072         }
5073       break;
5074
5075     case E_CC_NZmode:
5076       switch (comp_code)
5077         {
5078         case NE: return AARCH64_NE;
5079         case EQ: return AARCH64_EQ;
5080         case GE: return AARCH64_PL;
5081         case LT: return AARCH64_MI;
5082         default: return -1;
5083         }
5084       break;
5085
5086     case E_CC_Zmode:
5087       switch (comp_code)
5088         {
5089         case NE: return AARCH64_NE;
5090         case EQ: return AARCH64_EQ;
5091         default: return -1;
5092         }
5093       break;
5094
5095     case E_CC_Cmode:
5096       switch (comp_code)
5097         {
5098         case NE: return AARCH64_CS;
5099         case EQ: return AARCH64_CC;
5100         default: return -1;
5101         }
5102       break;
5103
5104     default:
5105       return -1;
5106     }
5107
5108   return -1;
5109 }
5110
5111 bool
5112 aarch64_const_vec_all_same_in_range_p (rtx x,
5113                                   HOST_WIDE_INT minval,
5114                                   HOST_WIDE_INT maxval)
5115 {
5116   HOST_WIDE_INT firstval;
5117   int count, i;
5118
5119   if (GET_CODE (x) != CONST_VECTOR
5120       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5121     return false;
5122
5123   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5124   if (firstval < minval || firstval > maxval)
5125     return false;
5126
5127   count = CONST_VECTOR_NUNITS (x);
5128   for (i = 1; i < count; i++)
5129     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5130       return false;
5131
5132   return true;
5133 }
5134
5135 bool
5136 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5137 {
5138   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5139 }
5140
5141
5142 /* N Z C V.  */
5143 #define AARCH64_CC_V 1
5144 #define AARCH64_CC_C (1 << 1)
5145 #define AARCH64_CC_Z (1 << 2)
5146 #define AARCH64_CC_N (1 << 3)
5147
5148 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5149 static const int aarch64_nzcv_codes[] =
5150 {
5151   0,            /* EQ, Z == 1.  */
5152   AARCH64_CC_Z, /* NE, Z == 0.  */
5153   0,            /* CS, C == 1.  */
5154   AARCH64_CC_C, /* CC, C == 0.  */
5155   0,            /* MI, N == 1.  */
5156   AARCH64_CC_N, /* PL, N == 0.  */
5157   0,            /* VS, V == 1.  */
5158   AARCH64_CC_V, /* VC, V == 0.  */
5159   0,            /* HI, C ==1 && Z == 0.  */
5160   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5161   AARCH64_CC_V, /* GE, N == V.  */
5162   0,            /* LT, N != V.  */
5163   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5164   0,            /* LE, !(Z == 0 && N == V).  */
5165   0,            /* AL, Any.  */
5166   0             /* NV, Any.  */
5167 };
5168
5169 /* Print operand X to file F in a target specific manner according to CODE.
5170    The acceptable formatting commands given by CODE are:
5171      'c':               An integer or symbol address without a preceding #
5172                         sign.
5173      'e':               Print the sign/zero-extend size as a character 8->b,
5174                         16->h, 32->w.
5175      'p':               Prints N such that 2^N == X (X must be power of 2 and
5176                         const int).
5177      'P':               Print the number of non-zero bits in X (a const_int).
5178      'H':               Print the higher numbered register of a pair (TImode)
5179                         of regs.
5180      'm':               Print a condition (eq, ne, etc).
5181      'M':               Same as 'm', but invert condition.
5182      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5183      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5184                         The register printed is the FP/SIMD register name
5185                         of X + 0/1/2/3 for S/T/U/V.
5186      'R':               Print a scalar FP/SIMD register name + 1.
5187      'X':               Print bottom 16 bits of integer constant in hex.
5188      'w/x':             Print a general register name or the zero register
5189                         (32-bit or 64-bit).
5190      '0':               Print a normal operand, if it's a general register,
5191                         then we assume DImode.
5192      'k':               Print NZCV for conditional compare instructions.
5193      'A':               Output address constant representing the first
5194                         argument of X, specifying a relocation offset
5195                         if appropriate.
5196      'L':               Output constant address specified by X
5197                         with a relocation offset if appropriate.
5198      'G':               Prints address of X, specifying a PC relative
5199                         relocation mode if appropriate.  */
5200
5201 static void
5202 aarch64_print_operand (FILE *f, rtx x, int code)
5203 {
5204   switch (code)
5205     {
5206     case 'c':
5207       switch (GET_CODE (x))
5208         {
5209         case CONST_INT:
5210           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5211           break;
5212
5213         case SYMBOL_REF:
5214           output_addr_const (f, x);
5215           break;
5216
5217         case CONST:
5218           if (GET_CODE (XEXP (x, 0)) == PLUS
5219               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5220             {
5221               output_addr_const (f, x);
5222               break;
5223             }
5224           /* Fall through.  */
5225
5226         default:
5227           output_operand_lossage ("Unsupported operand for code '%c'", code);
5228         }
5229       break;
5230
5231     case 'e':
5232       {
5233         int n;
5234
5235         if (!CONST_INT_P (x)
5236             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5237           {
5238             output_operand_lossage ("invalid operand for '%%%c'", code);
5239             return;
5240           }
5241
5242         switch (n)
5243           {
5244           case 3:
5245             fputc ('b', f);
5246             break;
5247           case 4:
5248             fputc ('h', f);
5249             break;
5250           case 5:
5251             fputc ('w', f);
5252             break;
5253           default:
5254             output_operand_lossage ("invalid operand for '%%%c'", code);
5255             return;
5256           }
5257       }
5258       break;
5259
5260     case 'p':
5261       {
5262         int n;
5263
5264         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5265           {
5266             output_operand_lossage ("invalid operand for '%%%c'", code);
5267             return;
5268           }
5269
5270         asm_fprintf (f, "%d", n);
5271       }
5272       break;
5273
5274     case 'P':
5275       if (!CONST_INT_P (x))
5276         {
5277           output_operand_lossage ("invalid operand for '%%%c'", code);
5278           return;
5279         }
5280
5281       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5282       break;
5283
5284     case 'H':
5285       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5286         {
5287           output_operand_lossage ("invalid operand for '%%%c'", code);
5288           return;
5289         }
5290
5291       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5292       break;
5293
5294     case 'M':
5295     case 'm':
5296       {
5297         int cond_code;
5298         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5299         if (x == const_true_rtx)
5300           {
5301             if (code == 'M')
5302               fputs ("nv", f);
5303             return;
5304           }
5305
5306         if (!COMPARISON_P (x))
5307           {
5308             output_operand_lossage ("invalid operand for '%%%c'", code);
5309             return;
5310           }
5311
5312         cond_code = aarch64_get_condition_code (x);
5313         gcc_assert (cond_code >= 0);
5314         if (code == 'M')
5315           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5316         fputs (aarch64_condition_codes[cond_code], f);
5317       }
5318       break;
5319
5320     case 'b':
5321     case 'h':
5322     case 's':
5323     case 'd':
5324     case 'q':
5325       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5326         {
5327           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5328           return;
5329         }
5330       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5331       break;
5332
5333     case 'S':
5334     case 'T':
5335     case 'U':
5336     case 'V':
5337       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5338         {
5339           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5340           return;
5341         }
5342       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5343       break;
5344
5345     case 'R':
5346       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5347         {
5348           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5349           return;
5350         }
5351       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5352       break;
5353
5354     case 'X':
5355       if (!CONST_INT_P (x))
5356         {
5357           output_operand_lossage ("invalid operand for '%%%c'", code);
5358           return;
5359         }
5360       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5361       break;
5362
5363     case 'w':
5364     case 'x':
5365       if (x == const0_rtx
5366           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5367         {
5368           asm_fprintf (f, "%czr", code);
5369           break;
5370         }
5371
5372       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5373         {
5374           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5375           break;
5376         }
5377
5378       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5379         {
5380           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5381           break;
5382         }
5383
5384       /* Fall through */
5385
5386     case 0:
5387       if (x == NULL)
5388         {
5389           output_operand_lossage ("missing operand");
5390           return;
5391         }
5392
5393       switch (GET_CODE (x))
5394         {
5395         case REG:
5396           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5397           break;
5398
5399         case MEM:
5400           output_address (GET_MODE (x), XEXP (x, 0));
5401           /* Check all memory references are Pmode - even with ILP32.  */
5402           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5403           break;
5404
5405         case CONST:
5406         case LABEL_REF:
5407         case SYMBOL_REF:
5408           output_addr_const (asm_out_file, x);
5409           break;
5410
5411         case CONST_INT:
5412           asm_fprintf (f, "%wd", INTVAL (x));
5413           break;
5414
5415         case CONST_VECTOR:
5416           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5417             {
5418               gcc_assert (
5419                   aarch64_const_vec_all_same_in_range_p (x,
5420                                                          HOST_WIDE_INT_MIN,
5421                                                          HOST_WIDE_INT_MAX));
5422               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5423             }
5424           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5425             {
5426               fputc ('0', f);
5427             }
5428           else
5429             gcc_unreachable ();
5430           break;
5431
5432         case CONST_DOUBLE:
5433           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5434              be getting CONST_DOUBLEs holding integers.  */
5435           gcc_assert (GET_MODE (x) != VOIDmode);
5436           if (aarch64_float_const_zero_rtx_p (x))
5437             {
5438               fputc ('0', f);
5439               break;
5440             }
5441           else if (aarch64_float_const_representable_p (x))
5442             {
5443 #define buf_size 20
5444               char float_buf[buf_size] = {'\0'};
5445               real_to_decimal_for_mode (float_buf,
5446                                         CONST_DOUBLE_REAL_VALUE (x),
5447                                         buf_size, buf_size,
5448                                         1, GET_MODE (x));
5449               asm_fprintf (asm_out_file, "%s", float_buf);
5450               break;
5451 #undef buf_size
5452             }
5453           output_operand_lossage ("invalid constant");
5454           return;
5455         default:
5456           output_operand_lossage ("invalid operand");
5457           return;
5458         }
5459       break;
5460
5461     case 'A':
5462       if (GET_CODE (x) == HIGH)
5463         x = XEXP (x, 0);
5464
5465       switch (aarch64_classify_symbolic_expression (x))
5466         {
5467         case SYMBOL_SMALL_GOT_4G:
5468           asm_fprintf (asm_out_file, ":got:");
5469           break;
5470
5471         case SYMBOL_SMALL_TLSGD:
5472           asm_fprintf (asm_out_file, ":tlsgd:");
5473           break;
5474
5475         case SYMBOL_SMALL_TLSDESC:
5476           asm_fprintf (asm_out_file, ":tlsdesc:");
5477           break;
5478
5479         case SYMBOL_SMALL_TLSIE:
5480           asm_fprintf (asm_out_file, ":gottprel:");
5481           break;
5482
5483         case SYMBOL_TLSLE24:
5484           asm_fprintf (asm_out_file, ":tprel:");
5485           break;
5486
5487         case SYMBOL_TINY_GOT:
5488           gcc_unreachable ();
5489           break;
5490
5491         default:
5492           break;
5493         }
5494       output_addr_const (asm_out_file, x);
5495       break;
5496
5497     case 'L':
5498       switch (aarch64_classify_symbolic_expression (x))
5499         {
5500         case SYMBOL_SMALL_GOT_4G:
5501           asm_fprintf (asm_out_file, ":lo12:");
5502           break;
5503
5504         case SYMBOL_SMALL_TLSGD:
5505           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5506           break;
5507
5508         case SYMBOL_SMALL_TLSDESC:
5509           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5510           break;
5511
5512         case SYMBOL_SMALL_TLSIE:
5513           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5514           break;
5515
5516         case SYMBOL_TLSLE12:
5517           asm_fprintf (asm_out_file, ":tprel_lo12:");
5518           break;
5519
5520         case SYMBOL_TLSLE24:
5521           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5522           break;
5523
5524         case SYMBOL_TINY_GOT:
5525           asm_fprintf (asm_out_file, ":got:");
5526           break;
5527
5528         case SYMBOL_TINY_TLSIE:
5529           asm_fprintf (asm_out_file, ":gottprel:");
5530           break;
5531
5532         default:
5533           break;
5534         }
5535       output_addr_const (asm_out_file, x);
5536       break;
5537
5538     case 'G':
5539       switch (aarch64_classify_symbolic_expression (x))
5540         {
5541         case SYMBOL_TLSLE24:
5542           asm_fprintf (asm_out_file, ":tprel_hi12:");
5543           break;
5544         default:
5545           break;
5546         }
5547       output_addr_const (asm_out_file, x);
5548       break;
5549
5550     case 'k':
5551       {
5552         HOST_WIDE_INT cond_code;
5553
5554         if (!CONST_INT_P (x))
5555           {
5556             output_operand_lossage ("invalid operand for '%%%c'", code);
5557             return;
5558           }
5559
5560         cond_code = INTVAL (x);
5561         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5562         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5563       }
5564       break;
5565
5566     default:
5567       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5568       return;
5569     }
5570 }
5571
5572 static void
5573 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5574 {
5575   struct aarch64_address_info addr;
5576
5577   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5578     switch (addr.type)
5579       {
5580       case ADDRESS_REG_IMM:
5581         if (addr.offset == const0_rtx)
5582           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5583         else
5584           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5585                        INTVAL (addr.offset));
5586         return;
5587
5588       case ADDRESS_REG_REG:
5589         if (addr.shift == 0)
5590           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5591                        reg_names [REGNO (addr.offset)]);
5592         else
5593           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5594                        reg_names [REGNO (addr.offset)], addr.shift);
5595         return;
5596
5597       case ADDRESS_REG_UXTW:
5598         if (addr.shift == 0)
5599           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5600                        REGNO (addr.offset) - R0_REGNUM);
5601         else
5602           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5603                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5604         return;
5605
5606       case ADDRESS_REG_SXTW:
5607         if (addr.shift == 0)
5608           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5609                        REGNO (addr.offset) - R0_REGNUM);
5610         else
5611           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5612                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5613         return;
5614
5615       case ADDRESS_REG_WB:
5616         switch (GET_CODE (x))
5617           {
5618           case PRE_INC:
5619             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5620                          GET_MODE_SIZE (mode));
5621             return;
5622           case POST_INC:
5623             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5624                          GET_MODE_SIZE (mode));
5625             return;
5626           case PRE_DEC:
5627             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5628                          GET_MODE_SIZE (mode));
5629             return;
5630           case POST_DEC:
5631             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5632                          GET_MODE_SIZE (mode));
5633             return;
5634           case PRE_MODIFY:
5635             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5636                          INTVAL (addr.offset));
5637             return;
5638           case POST_MODIFY:
5639             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5640                          INTVAL (addr.offset));
5641             return;
5642           default:
5643             break;
5644           }
5645         break;
5646
5647       case ADDRESS_LO_SUM:
5648         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5649         output_addr_const (f, addr.offset);
5650         asm_fprintf (f, "]");
5651         return;
5652
5653       case ADDRESS_SYMBOLIC:
5654         break;
5655       }
5656
5657   output_addr_const (f, x);
5658 }
5659
5660 bool
5661 aarch64_label_mentioned_p (rtx x)
5662 {
5663   const char *fmt;
5664   int i;
5665
5666   if (GET_CODE (x) == LABEL_REF)
5667     return true;
5668
5669   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5670      referencing instruction, but they are constant offsets, not
5671      symbols.  */
5672   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5673     return false;
5674
5675   fmt = GET_RTX_FORMAT (GET_CODE (x));
5676   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5677     {
5678       if (fmt[i] == 'E')
5679         {
5680           int j;
5681
5682           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5683             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5684               return 1;
5685         }
5686       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5687         return 1;
5688     }
5689
5690   return 0;
5691 }
5692
5693 /* Implement REGNO_REG_CLASS.  */
5694
5695 enum reg_class
5696 aarch64_regno_regclass (unsigned regno)
5697 {
5698   if (GP_REGNUM_P (regno))
5699     return GENERAL_REGS;
5700
5701   if (regno == SP_REGNUM)
5702     return STACK_REG;
5703
5704   if (regno == FRAME_POINTER_REGNUM
5705       || regno == ARG_POINTER_REGNUM)
5706     return POINTER_REGS;
5707
5708   if (FP_REGNUM_P (regno))
5709     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5710
5711   return NO_REGS;
5712 }
5713
5714 static rtx
5715 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5716 {
5717   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5718      where mask is selected by alignment and size of the offset.
5719      We try to pick as large a range for the offset as possible to
5720      maximize the chance of a CSE.  However, for aligned addresses
5721      we limit the range to 4k so that structures with different sized
5722      elements are likely to use the same base.  We need to be careful
5723      not to split a CONST for some forms of address expression, otherwise
5724      it will generate sub-optimal code.  */
5725
5726   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5727     {
5728       rtx base = XEXP (x, 0);
5729       rtx offset_rtx = XEXP (x, 1);
5730       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5731
5732       if (GET_CODE (base) == PLUS)
5733         {
5734           rtx op0 = XEXP (base, 0);
5735           rtx op1 = XEXP (base, 1);
5736
5737           /* Force any scaling into a temp for CSE.  */
5738           op0 = force_reg (Pmode, op0);
5739           op1 = force_reg (Pmode, op1);
5740
5741           /* Let the pointer register be in op0.  */
5742           if (REG_POINTER (op1))
5743             std::swap (op0, op1);
5744
5745           /* If the pointer is virtual or frame related, then we know that
5746              virtual register instantiation or register elimination is going
5747              to apply a second constant.  We want the two constants folded
5748              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5749           if (virt_or_elim_regno_p (REGNO (op0)))
5750             {
5751               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5752                                    NULL_RTX, true, OPTAB_DIRECT);
5753               return gen_rtx_PLUS (Pmode, base, op1);
5754             }
5755
5756           /* Otherwise, in order to encourage CSE (and thence loop strength
5757              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5758           base = expand_binop (Pmode, add_optab, op0, op1,
5759                                NULL_RTX, true, OPTAB_DIRECT);
5760           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5761         }
5762
5763       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5764       HOST_WIDE_INT base_offset;
5765       if (GET_MODE_SIZE (mode) > 16)
5766         base_offset = (offset + 0x400) & ~0x7f0;
5767       /* For offsets aren't a multiple of the access size, the limit is
5768          -256...255.  */
5769       else if (offset & (GET_MODE_SIZE (mode) - 1))
5770         {
5771           base_offset = (offset + 0x100) & ~0x1ff;
5772
5773           /* BLKmode typically uses LDP of X-registers.  */
5774           if (mode == BLKmode)
5775             base_offset = (offset + 512) & ~0x3ff;
5776         }
5777       /* Small negative offsets are supported.  */
5778       else if (IN_RANGE (offset, -256, 0))
5779         base_offset = 0;
5780       else if (mode == TImode || mode == TFmode)
5781         base_offset = (offset + 0x100) & ~0x1ff;
5782       /* Use 12-bit offset by access size.  */
5783       else
5784         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5785
5786       if (base_offset != 0)
5787         {
5788           base = plus_constant (Pmode, base, base_offset);
5789           base = force_operand (base, NULL_RTX);
5790           return plus_constant (Pmode, base, offset - base_offset);
5791         }
5792     }
5793
5794   return x;
5795 }
5796
5797 /* Return the reload icode required for a constant pool in mode.  */
5798 static enum insn_code
5799 aarch64_constant_pool_reload_icode (machine_mode mode)
5800 {
5801   switch (mode)
5802     {
5803     case E_SFmode:
5804       return CODE_FOR_aarch64_reload_movcpsfdi;
5805
5806     case E_DFmode:
5807       return CODE_FOR_aarch64_reload_movcpdfdi;
5808
5809     case E_TFmode:
5810       return CODE_FOR_aarch64_reload_movcptfdi;
5811
5812     case E_V8QImode:
5813       return CODE_FOR_aarch64_reload_movcpv8qidi;
5814
5815     case E_V16QImode:
5816       return CODE_FOR_aarch64_reload_movcpv16qidi;
5817
5818     case E_V4HImode:
5819       return CODE_FOR_aarch64_reload_movcpv4hidi;
5820
5821     case E_V8HImode:
5822       return CODE_FOR_aarch64_reload_movcpv8hidi;
5823
5824     case E_V2SImode:
5825       return CODE_FOR_aarch64_reload_movcpv2sidi;
5826
5827     case E_V4SImode:
5828       return CODE_FOR_aarch64_reload_movcpv4sidi;
5829
5830     case E_V2DImode:
5831       return CODE_FOR_aarch64_reload_movcpv2didi;
5832
5833     case E_V2DFmode:
5834       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5835
5836     default:
5837       gcc_unreachable ();
5838     }
5839
5840   gcc_unreachable ();
5841 }
5842 static reg_class_t
5843 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5844                           reg_class_t rclass,
5845                           machine_mode mode,
5846                           secondary_reload_info *sri)
5847 {
5848
5849   /* If we have to disable direct literal pool loads and stores because the
5850      function is too big, then we need a scratch register.  */
5851   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5852       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5853           || targetm.vector_mode_supported_p (GET_MODE (x)))
5854       && !aarch64_pcrelative_literal_loads)
5855     {
5856       sri->icode = aarch64_constant_pool_reload_icode (mode);
5857       return NO_REGS;
5858     }
5859
5860   /* Without the TARGET_SIMD instructions we cannot move a Q register
5861      to a Q register directly.  We need a scratch.  */
5862   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5863       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5864       && reg_class_subset_p (rclass, FP_REGS))
5865     {
5866       if (mode == TFmode)
5867         sri->icode = CODE_FOR_aarch64_reload_movtf;
5868       else if (mode == TImode)
5869         sri->icode = CODE_FOR_aarch64_reload_movti;
5870       return NO_REGS;
5871     }
5872
5873   /* A TFmode or TImode memory access should be handled via an FP_REGS
5874      because AArch64 has richer addressing modes for LDR/STR instructions
5875      than LDP/STP instructions.  */
5876   if (TARGET_FLOAT && rclass == GENERAL_REGS
5877       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5878     return FP_REGS;
5879
5880   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5881       return GENERAL_REGS;
5882
5883   return NO_REGS;
5884 }
5885
5886 static bool
5887 aarch64_can_eliminate (const int from, const int to)
5888 {
5889   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5890      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5891
5892   if (frame_pointer_needed)
5893     {
5894       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5895         return true;
5896       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5897         return false;
5898       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5899           && !cfun->calls_alloca)
5900         return true;
5901       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5902         return true;
5903
5904       return false;
5905     }
5906   else
5907     {
5908       /* If we decided that we didn't need a leaf frame pointer but then used
5909          LR in the function, then we'll want a frame pointer after all, so
5910          prevent this elimination to ensure a frame pointer is used.  */
5911       if (to == STACK_POINTER_REGNUM
5912           && flag_omit_leaf_frame_pointer
5913           && df_regs_ever_live_p (LR_REGNUM))
5914         return false;
5915     }
5916
5917   return true;
5918 }
5919
5920 HOST_WIDE_INT
5921 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5922 {
5923   aarch64_layout_frame ();
5924
5925   if (to == HARD_FRAME_POINTER_REGNUM)
5926     {
5927       if (from == ARG_POINTER_REGNUM)
5928         return cfun->machine->frame.hard_fp_offset;
5929
5930       if (from == FRAME_POINTER_REGNUM)
5931         return cfun->machine->frame.hard_fp_offset
5932                - cfun->machine->frame.locals_offset;
5933     }
5934
5935   if (to == STACK_POINTER_REGNUM)
5936     {
5937       if (from == FRAME_POINTER_REGNUM)
5938           return cfun->machine->frame.frame_size
5939                  - cfun->machine->frame.locals_offset;
5940     }
5941
5942   return cfun->machine->frame.frame_size;
5943 }
5944
5945 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5946    previous frame.  */
5947
5948 rtx
5949 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5950 {
5951   if (count != 0)
5952     return const0_rtx;
5953   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5954 }
5955
5956
5957 static void
5958 aarch64_asm_trampoline_template (FILE *f)
5959 {
5960   if (TARGET_ILP32)
5961     {
5962       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5963       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5964     }
5965   else
5966     {
5967       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5968       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5969     }
5970   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5971   assemble_aligned_integer (4, const0_rtx);
5972   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5973   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5974 }
5975
5976 static void
5977 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5978 {
5979   rtx fnaddr, mem, a_tramp;
5980   const int tramp_code_sz = 16;
5981
5982   /* Don't need to copy the trailing D-words, we fill those in below.  */
5983   emit_block_move (m_tramp, assemble_trampoline_template (),
5984                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5985   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5986   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5987   if (GET_MODE (fnaddr) != ptr_mode)
5988     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5989   emit_move_insn (mem, fnaddr);
5990
5991   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5992   emit_move_insn (mem, chain_value);
5993
5994   /* XXX We should really define a "clear_cache" pattern and use
5995      gen_clear_cache().  */
5996   a_tramp = XEXP (m_tramp, 0);
5997   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5998                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
5999                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6000                      ptr_mode);
6001 }
6002
6003 static unsigned char
6004 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6005 {
6006   switch (regclass)
6007     {
6008     case CALLER_SAVE_REGS:
6009     case POINTER_REGS:
6010     case GENERAL_REGS:
6011     case ALL_REGS:
6012     case FP_REGS:
6013     case FP_LO_REGS:
6014       return
6015         aarch64_vector_mode_p (mode)
6016           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6017           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6018     case STACK_REG:
6019       return 1;
6020
6021     case NO_REGS:
6022       return 0;
6023
6024     default:
6025       break;
6026     }
6027   gcc_unreachable ();
6028 }
6029
6030 static reg_class_t
6031 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6032 {
6033   if (regclass == POINTER_REGS)
6034     return GENERAL_REGS;
6035
6036   if (regclass == STACK_REG)
6037     {
6038       if (REG_P(x)
6039           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6040           return regclass;
6041
6042       return NO_REGS;
6043     }
6044
6045   /* Register eliminiation can result in a request for
6046      SP+constant->FP_REGS.  We cannot support such operations which
6047      use SP as source and an FP_REG as destination, so reject out
6048      right now.  */
6049   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6050     {
6051       rtx lhs = XEXP (x, 0);
6052
6053       /* Look through a possible SUBREG introduced by ILP32.  */
6054       if (GET_CODE (lhs) == SUBREG)
6055         lhs = SUBREG_REG (lhs);
6056
6057       gcc_assert (REG_P (lhs));
6058       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6059                                       POINTER_REGS));
6060       return NO_REGS;
6061     }
6062
6063   return regclass;
6064 }
6065
6066 void
6067 aarch64_asm_output_labelref (FILE* f, const char *name)
6068 {
6069   asm_fprintf (f, "%U%s", name);
6070 }
6071
6072 static void
6073 aarch64_elf_asm_constructor (rtx symbol, int priority)
6074 {
6075   if (priority == DEFAULT_INIT_PRIORITY)
6076     default_ctor_section_asm_out_constructor (symbol, priority);
6077   else
6078     {
6079       section *s;
6080       /* While priority is known to be in range [0, 65535], so 18 bytes
6081          would be enough, the compiler might not know that.  To avoid
6082          -Wformat-truncation false positive, use a larger size.  */
6083       char buf[23];
6084       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6085       s = get_section (buf, SECTION_WRITE, NULL);
6086       switch_to_section (s);
6087       assemble_align (POINTER_SIZE);
6088       assemble_aligned_integer (POINTER_BYTES, symbol);
6089     }
6090 }
6091
6092 static void
6093 aarch64_elf_asm_destructor (rtx symbol, int priority)
6094 {
6095   if (priority == DEFAULT_INIT_PRIORITY)
6096     default_dtor_section_asm_out_destructor (symbol, priority);
6097   else
6098     {
6099       section *s;
6100       /* While priority is known to be in range [0, 65535], so 18 bytes
6101          would be enough, the compiler might not know that.  To avoid
6102          -Wformat-truncation false positive, use a larger size.  */
6103       char buf[23];
6104       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6105       s = get_section (buf, SECTION_WRITE, NULL);
6106       switch_to_section (s);
6107       assemble_align (POINTER_SIZE);
6108       assemble_aligned_integer (POINTER_BYTES, symbol);
6109     }
6110 }
6111
6112 const char*
6113 aarch64_output_casesi (rtx *operands)
6114 {
6115   char buf[100];
6116   char label[100];
6117   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6118   int index;
6119   static const char *const patterns[4][2] =
6120   {
6121     {
6122       "ldrb\t%w3, [%0,%w1,uxtw]",
6123       "add\t%3, %4, %w3, sxtb #2"
6124     },
6125     {
6126       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6127       "add\t%3, %4, %w3, sxth #2"
6128     },
6129     {
6130       "ldr\t%w3, [%0,%w1,uxtw #2]",
6131       "add\t%3, %4, %w3, sxtw #2"
6132     },
6133     /* We assume that DImode is only generated when not optimizing and
6134        that we don't really need 64-bit address offsets.  That would
6135        imply an object file with 8GB of code in a single function!  */
6136     {
6137       "ldr\t%w3, [%0,%w1,uxtw #2]",
6138       "add\t%3, %4, %w3, sxtw #2"
6139     }
6140   };
6141
6142   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6143
6144   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6145   index = exact_log2 (GET_MODE_SIZE (mode));
6146
6147   gcc_assert (index >= 0 && index <= 3);
6148
6149   /* Need to implement table size reduction, by chaning the code below.  */
6150   output_asm_insn (patterns[index][0], operands);
6151   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6152   snprintf (buf, sizeof (buf),
6153             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6154   output_asm_insn (buf, operands);
6155   output_asm_insn (patterns[index][1], operands);
6156   output_asm_insn ("br\t%3", operands);
6157   assemble_label (asm_out_file, label);
6158   return "";
6159 }
6160
6161
6162 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6163    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6164    operator.  */
6165
6166 int
6167 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6168 {
6169   if (shift >= 0 && shift <= 3)
6170     {
6171       int size;
6172       for (size = 8; size <= 32; size *= 2)
6173         {
6174           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6175           if (mask == bits << shift)
6176             return size;
6177         }
6178     }
6179   return 0;
6180 }
6181
6182 /* Constant pools are per function only when PC relative
6183    literal loads are true or we are in the large memory
6184    model.  */
6185
6186 static inline bool
6187 aarch64_can_use_per_function_literal_pools_p (void)
6188 {
6189   return (aarch64_pcrelative_literal_loads
6190           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6191 }
6192
6193 static bool
6194 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6195 {
6196   /* Fixme:: In an ideal world this would work similar
6197      to the logic in aarch64_select_rtx_section but this
6198      breaks bootstrap in gcc go.  For now we workaround
6199      this by returning false here.  */
6200   return false;
6201 }
6202
6203 /* Select appropriate section for constants depending
6204    on where we place literal pools.  */
6205
6206 static section *
6207 aarch64_select_rtx_section (machine_mode mode,
6208                             rtx x,
6209                             unsigned HOST_WIDE_INT align)
6210 {
6211   if (aarch64_can_use_per_function_literal_pools_p ())
6212     return function_section (current_function_decl);
6213
6214   return default_elf_select_rtx_section (mode, x, align);
6215 }
6216
6217 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6218 void
6219 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6220                                   HOST_WIDE_INT offset)
6221 {
6222   /* When using per-function literal pools, we must ensure that any code
6223      section is aligned to the minimal instruction length, lest we get
6224      errors from the assembler re "unaligned instructions".  */
6225   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6226     ASM_OUTPUT_ALIGN (f, 2);
6227 }
6228
6229 /* Costs.  */
6230
6231 /* Helper function for rtx cost calculation.  Strip a shift expression
6232    from X.  Returns the inner operand if successful, or the original
6233    expression on failure.  */
6234 static rtx
6235 aarch64_strip_shift (rtx x)
6236 {
6237   rtx op = x;
6238
6239   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6240      we can convert both to ROR during final output.  */
6241   if ((GET_CODE (op) == ASHIFT
6242        || GET_CODE (op) == ASHIFTRT
6243        || GET_CODE (op) == LSHIFTRT
6244        || GET_CODE (op) == ROTATERT
6245        || GET_CODE (op) == ROTATE)
6246       && CONST_INT_P (XEXP (op, 1)))
6247     return XEXP (op, 0);
6248
6249   if (GET_CODE (op) == MULT
6250       && CONST_INT_P (XEXP (op, 1))
6251       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6252     return XEXP (op, 0);
6253
6254   return x;
6255 }
6256
6257 /* Helper function for rtx cost calculation.  Strip an extend
6258    expression from X.  Returns the inner operand if successful, or the
6259    original expression on failure.  We deal with a number of possible
6260    canonicalization variations here. If STRIP_SHIFT is true, then
6261    we can strip off a shift also.  */
6262 static rtx
6263 aarch64_strip_extend (rtx x, bool strip_shift)
6264 {
6265   scalar_int_mode mode;
6266   rtx op = x;
6267
6268   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6269     return op;
6270
6271   /* Zero and sign extraction of a widened value.  */
6272   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6273       && XEXP (op, 2) == const0_rtx
6274       && GET_CODE (XEXP (op, 0)) == MULT
6275       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6276                                          XEXP (op, 1)))
6277     return XEXP (XEXP (op, 0), 0);
6278
6279   /* It can also be represented (for zero-extend) as an AND with an
6280      immediate.  */
6281   if (GET_CODE (op) == AND
6282       && GET_CODE (XEXP (op, 0)) == MULT
6283       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6284       && CONST_INT_P (XEXP (op, 1))
6285       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6286                            INTVAL (XEXP (op, 1))) != 0)
6287     return XEXP (XEXP (op, 0), 0);
6288
6289   /* Now handle extended register, as this may also have an optional
6290      left shift by 1..4.  */
6291   if (strip_shift
6292       && GET_CODE (op) == ASHIFT
6293       && CONST_INT_P (XEXP (op, 1))
6294       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6295     op = XEXP (op, 0);
6296
6297   if (GET_CODE (op) == ZERO_EXTEND
6298       || GET_CODE (op) == SIGN_EXTEND)
6299     op = XEXP (op, 0);
6300
6301   if (op != x)
6302     return op;
6303
6304   return x;
6305 }
6306
6307 /* Return true iff CODE is a shift supported in combination
6308    with arithmetic instructions.  */
6309
6310 static bool
6311 aarch64_shift_p (enum rtx_code code)
6312 {
6313   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6314 }
6315
6316
6317 /* Return true iff X is a cheap shift without a sign extend. */
6318
6319 static bool
6320 aarch64_cheap_mult_shift_p (rtx x)
6321 {
6322   rtx op0, op1;
6323
6324   op0 = XEXP (x, 0);
6325   op1 = XEXP (x, 1);
6326
6327   if (!(aarch64_tune_params.extra_tuning_flags
6328                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6329     return false;
6330
6331   if (GET_CODE (op0) == SIGN_EXTEND)
6332     return false;
6333
6334   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6335       && UINTVAL (op1) <= 4)
6336     return true;
6337
6338   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6339     return false;
6340
6341   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6342
6343   if (l2 > 0 && l2 <= 4)
6344     return true;
6345
6346   return false;
6347 }
6348
6349 /* Helper function for rtx cost calculation.  Calculate the cost of
6350    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6351    Return the calculated cost of the expression, recursing manually in to
6352    operands where needed.  */
6353
6354 static int
6355 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6356 {
6357   rtx op0, op1;
6358   const struct cpu_cost_table *extra_cost
6359     = aarch64_tune_params.insn_extra_cost;
6360   int cost = 0;
6361   bool compound_p = (outer == PLUS || outer == MINUS);
6362   machine_mode mode = GET_MODE (x);
6363
6364   gcc_checking_assert (code == MULT);
6365
6366   op0 = XEXP (x, 0);
6367   op1 = XEXP (x, 1);
6368
6369   if (VECTOR_MODE_P (mode))
6370     mode = GET_MODE_INNER (mode);
6371
6372   /* Integer multiply/fma.  */
6373   if (GET_MODE_CLASS (mode) == MODE_INT)
6374     {
6375       /* The multiply will be canonicalized as a shift, cost it as such.  */
6376       if (aarch64_shift_p (GET_CODE (x))
6377           || (CONST_INT_P (op1)
6378               && exact_log2 (INTVAL (op1)) > 0))
6379         {
6380           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6381                            || GET_CODE (op0) == SIGN_EXTEND;
6382           if (speed)
6383             {
6384               if (compound_p)
6385                 {
6386                   /* If the shift is considered cheap,
6387                      then don't add any cost. */
6388                   if (aarch64_cheap_mult_shift_p (x))
6389                     ;
6390                   else if (REG_P (op1))
6391                     /* ARITH + shift-by-register.  */
6392                     cost += extra_cost->alu.arith_shift_reg;
6393                   else if (is_extend)
6394                     /* ARITH + extended register.  We don't have a cost field
6395                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6396                     cost += extra_cost->alu.extend_arith;
6397                   else
6398                     /* ARITH + shift-by-immediate.  */
6399                     cost += extra_cost->alu.arith_shift;
6400                 }
6401               else
6402                 /* LSL (immediate).  */
6403                 cost += extra_cost->alu.shift;
6404
6405             }
6406           /* Strip extends as we will have costed them in the case above.  */
6407           if (is_extend)
6408             op0 = aarch64_strip_extend (op0, true);
6409
6410           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6411
6412           return cost;
6413         }
6414
6415       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6416          compound and let the below cases handle it.  After all, MNEG is a
6417          special-case alias of MSUB.  */
6418       if (GET_CODE (op0) == NEG)
6419         {
6420           op0 = XEXP (op0, 0);
6421           compound_p = true;
6422         }
6423
6424       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6425       if ((GET_CODE (op0) == ZERO_EXTEND
6426            && GET_CODE (op1) == ZERO_EXTEND)
6427           || (GET_CODE (op0) == SIGN_EXTEND
6428               && GET_CODE (op1) == SIGN_EXTEND))
6429         {
6430           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6431           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6432
6433           if (speed)
6434             {
6435               if (compound_p)
6436                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6437                 cost += extra_cost->mult[0].extend_add;
6438               else
6439                 /* MUL/SMULL/UMULL.  */
6440                 cost += extra_cost->mult[0].extend;
6441             }
6442
6443           return cost;
6444         }
6445
6446       /* This is either an integer multiply or a MADD.  In both cases
6447          we want to recurse and cost the operands.  */
6448       cost += rtx_cost (op0, mode, MULT, 0, speed);
6449       cost += rtx_cost (op1, mode, MULT, 1, speed);
6450
6451       if (speed)
6452         {
6453           if (compound_p)
6454             /* MADD/MSUB.  */
6455             cost += extra_cost->mult[mode == DImode].add;
6456           else
6457             /* MUL.  */
6458             cost += extra_cost->mult[mode == DImode].simple;
6459         }
6460
6461       return cost;
6462     }
6463   else
6464     {
6465       if (speed)
6466         {
6467           /* Floating-point FMA/FMUL can also support negations of the
6468              operands, unless the rounding mode is upward or downward in
6469              which case FNMUL is different than FMUL with operand negation.  */
6470           bool neg0 = GET_CODE (op0) == NEG;
6471           bool neg1 = GET_CODE (op1) == NEG;
6472           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6473             {
6474               if (neg0)
6475                 op0 = XEXP (op0, 0);
6476               if (neg1)
6477                 op1 = XEXP (op1, 0);
6478             }
6479
6480           if (compound_p)
6481             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6482             cost += extra_cost->fp[mode == DFmode].fma;
6483           else
6484             /* FMUL/FNMUL.  */
6485             cost += extra_cost->fp[mode == DFmode].mult;
6486         }
6487
6488       cost += rtx_cost (op0, mode, MULT, 0, speed);
6489       cost += rtx_cost (op1, mode, MULT, 1, speed);
6490       return cost;
6491     }
6492 }
6493
6494 static int
6495 aarch64_address_cost (rtx x,
6496                       machine_mode mode,
6497                       addr_space_t as ATTRIBUTE_UNUSED,
6498                       bool speed)
6499 {
6500   enum rtx_code c = GET_CODE (x);
6501   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6502   struct aarch64_address_info info;
6503   int cost = 0;
6504   info.shift = 0;
6505
6506   if (!aarch64_classify_address (&info, x, mode, c, false))
6507     {
6508       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6509         {
6510           /* This is a CONST or SYMBOL ref which will be split
6511              in a different way depending on the code model in use.
6512              Cost it through the generic infrastructure.  */
6513           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6514           /* Divide through by the cost of one instruction to
6515              bring it to the same units as the address costs.  */
6516           cost_symbol_ref /= COSTS_N_INSNS (1);
6517           /* The cost is then the cost of preparing the address,
6518              followed by an immediate (possibly 0) offset.  */
6519           return cost_symbol_ref + addr_cost->imm_offset;
6520         }
6521       else
6522         {
6523           /* This is most likely a jump table from a case
6524              statement.  */
6525           return addr_cost->register_offset;
6526         }
6527     }
6528
6529   switch (info.type)
6530     {
6531       case ADDRESS_LO_SUM:
6532       case ADDRESS_SYMBOLIC:
6533       case ADDRESS_REG_IMM:
6534         cost += addr_cost->imm_offset;
6535         break;
6536
6537       case ADDRESS_REG_WB:
6538         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6539           cost += addr_cost->pre_modify;
6540         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6541           cost += addr_cost->post_modify;
6542         else
6543           gcc_unreachable ();
6544
6545         break;
6546
6547       case ADDRESS_REG_REG:
6548         cost += addr_cost->register_offset;
6549         break;
6550
6551       case ADDRESS_REG_SXTW:
6552         cost += addr_cost->register_sextend;
6553         break;
6554
6555       case ADDRESS_REG_UXTW:
6556         cost += addr_cost->register_zextend;
6557         break;
6558
6559       default:
6560         gcc_unreachable ();
6561     }
6562
6563
6564   if (info.shift > 0)
6565     {
6566       /* For the sake of calculating the cost of the shifted register
6567          component, we can treat same sized modes in the same way.  */
6568       switch (GET_MODE_BITSIZE (mode))
6569         {
6570           case 16:
6571             cost += addr_cost->addr_scale_costs.hi;
6572             break;
6573
6574           case 32:
6575             cost += addr_cost->addr_scale_costs.si;
6576             break;
6577
6578           case 64:
6579             cost += addr_cost->addr_scale_costs.di;
6580             break;
6581
6582           /* We can't tell, or this is a 128-bit vector.  */
6583           default:
6584             cost += addr_cost->addr_scale_costs.ti;
6585             break;
6586         }
6587     }
6588
6589   return cost;
6590 }
6591
6592 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6593    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6594    to be taken.  */
6595
6596 int
6597 aarch64_branch_cost (bool speed_p, bool predictable_p)
6598 {
6599   /* When optimizing for speed, use the cost of unpredictable branches.  */
6600   const struct cpu_branch_cost *branch_costs =
6601     aarch64_tune_params.branch_costs;
6602
6603   if (!speed_p || predictable_p)
6604     return branch_costs->predictable;
6605   else
6606     return branch_costs->unpredictable;
6607 }
6608
6609 /* Return true if the RTX X in mode MODE is a zero or sign extract
6610    usable in an ADD or SUB (extended register) instruction.  */
6611 static bool
6612 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6613 {
6614   /* Catch add with a sign extract.
6615      This is add_<optab><mode>_multp2.  */
6616   if (GET_CODE (x) == SIGN_EXTRACT
6617       || GET_CODE (x) == ZERO_EXTRACT)
6618     {
6619       rtx op0 = XEXP (x, 0);
6620       rtx op1 = XEXP (x, 1);
6621       rtx op2 = XEXP (x, 2);
6622
6623       if (GET_CODE (op0) == MULT
6624           && CONST_INT_P (op1)
6625           && op2 == const0_rtx
6626           && CONST_INT_P (XEXP (op0, 1))
6627           && aarch64_is_extend_from_extract (mode,
6628                                              XEXP (op0, 1),
6629                                              op1))
6630         {
6631           return true;
6632         }
6633     }
6634   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6635      No shift.  */
6636   else if (GET_CODE (x) == SIGN_EXTEND
6637            || GET_CODE (x) == ZERO_EXTEND)
6638     return REG_P (XEXP (x, 0));
6639
6640   return false;
6641 }
6642
6643 static bool
6644 aarch64_frint_unspec_p (unsigned int u)
6645 {
6646   switch (u)
6647     {
6648       case UNSPEC_FRINTZ:
6649       case UNSPEC_FRINTP:
6650       case UNSPEC_FRINTM:
6651       case UNSPEC_FRINTA:
6652       case UNSPEC_FRINTN:
6653       case UNSPEC_FRINTX:
6654       case UNSPEC_FRINTI:
6655         return true;
6656
6657       default:
6658         return false;
6659     }
6660 }
6661
6662 /* Return true iff X is an rtx that will match an extr instruction
6663    i.e. as described in the *extr<mode>5_insn family of patterns.
6664    OP0 and OP1 will be set to the operands of the shifts involved
6665    on success and will be NULL_RTX otherwise.  */
6666
6667 static bool
6668 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6669 {
6670   rtx op0, op1;
6671   scalar_int_mode mode;
6672   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6673     return false;
6674
6675   *res_op0 = NULL_RTX;
6676   *res_op1 = NULL_RTX;
6677
6678   if (GET_CODE (x) != IOR)
6679     return false;
6680
6681   op0 = XEXP (x, 0);
6682   op1 = XEXP (x, 1);
6683
6684   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6685       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6686     {
6687      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6688       if (GET_CODE (op1) == ASHIFT)
6689         std::swap (op0, op1);
6690
6691       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6692         return false;
6693
6694       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6695       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6696
6697       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6698           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6699         {
6700           *res_op0 = XEXP (op0, 0);
6701           *res_op1 = XEXP (op1, 0);
6702           return true;
6703         }
6704     }
6705
6706   return false;
6707 }
6708
6709 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6710    storing it in *COST.  Result is true if the total cost of the operation
6711    has now been calculated.  */
6712 static bool
6713 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6714 {
6715   rtx inner;
6716   rtx comparator;
6717   enum rtx_code cmpcode;
6718
6719   if (COMPARISON_P (op0))
6720     {
6721       inner = XEXP (op0, 0);
6722       comparator = XEXP (op0, 1);
6723       cmpcode = GET_CODE (op0);
6724     }
6725   else
6726     {
6727       inner = op0;
6728       comparator = const0_rtx;
6729       cmpcode = NE;
6730     }
6731
6732   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6733     {
6734       /* Conditional branch.  */
6735       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6736         return true;
6737       else
6738         {
6739           if (cmpcode == NE || cmpcode == EQ)
6740             {
6741               if (comparator == const0_rtx)
6742                 {
6743                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6744                   if (GET_CODE (inner) == ZERO_EXTRACT)
6745                     /* TBZ/TBNZ.  */
6746                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6747                                        ZERO_EXTRACT, 0, speed);
6748                   else
6749                     /* CBZ/CBNZ.  */
6750                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6751
6752                 return true;
6753               }
6754             }
6755           else if (cmpcode == LT || cmpcode == GE)
6756             {
6757               /* TBZ/TBNZ.  */
6758               if (comparator == const0_rtx)
6759                 return true;
6760             }
6761         }
6762     }
6763   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6764     {
6765       /* CCMP.  */
6766       if (GET_CODE (op1) == COMPARE)
6767         {
6768           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6769           if (XEXP (op1, 1) == const0_rtx)
6770             *cost += 1;
6771           if (speed)
6772             {
6773               machine_mode mode = GET_MODE (XEXP (op1, 0));
6774               const struct cpu_cost_table *extra_cost
6775                 = aarch64_tune_params.insn_extra_cost;
6776
6777               if (GET_MODE_CLASS (mode) == MODE_INT)
6778                 *cost += extra_cost->alu.arith;
6779               else
6780                 *cost += extra_cost->fp[mode == DFmode].compare;
6781             }
6782           return true;
6783         }
6784
6785       /* It's a conditional operation based on the status flags,
6786          so it must be some flavor of CSEL.  */
6787
6788       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6789       if (GET_CODE (op1) == NEG
6790           || GET_CODE (op1) == NOT
6791           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6792         op1 = XEXP (op1, 0);
6793       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6794         {
6795           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6796           op1 = XEXP (op1, 0);
6797           op2 = XEXP (op2, 0);
6798         }
6799
6800       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6801       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6802       return true;
6803     }
6804
6805   /* We don't know what this is, cost all operands.  */
6806   return false;
6807 }
6808
6809 /* Check whether X is a bitfield operation of the form shift + extend that
6810    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6811    operand to which the bitfield operation is applied.  Otherwise return
6812    NULL_RTX.  */
6813
6814 static rtx
6815 aarch64_extend_bitfield_pattern_p (rtx x)
6816 {
6817   rtx_code outer_code = GET_CODE (x);
6818   machine_mode outer_mode = GET_MODE (x);
6819
6820   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6821       && outer_mode != SImode && outer_mode != DImode)
6822     return NULL_RTX;
6823
6824   rtx inner = XEXP (x, 0);
6825   rtx_code inner_code = GET_CODE (inner);
6826   machine_mode inner_mode = GET_MODE (inner);
6827   rtx op = NULL_RTX;
6828
6829   switch (inner_code)
6830     {
6831       case ASHIFT:
6832         if (CONST_INT_P (XEXP (inner, 1))
6833             && (inner_mode == QImode || inner_mode == HImode))
6834           op = XEXP (inner, 0);
6835         break;
6836       case LSHIFTRT:
6837         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6838             && (inner_mode == QImode || inner_mode == HImode))
6839           op = XEXP (inner, 0);
6840         break;
6841       case ASHIFTRT:
6842         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6843             && (inner_mode == QImode || inner_mode == HImode))
6844           op = XEXP (inner, 0);
6845         break;
6846       default:
6847         break;
6848     }
6849
6850   return op;
6851 }
6852
6853 /* Return true if the mask and a shift amount from an RTX of the form
6854    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6855    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6856
6857 bool
6858 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6859                                     rtx shft_amnt)
6860 {
6861   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6862          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6863          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6864          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6865 }
6866
6867 /* Calculate the cost of calculating X, storing it in *COST.  Result
6868    is true if the total cost of the operation has now been calculated.  */
6869 static bool
6870 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6871                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6872 {
6873   rtx op0, op1, op2;
6874   const struct cpu_cost_table *extra_cost
6875     = aarch64_tune_params.insn_extra_cost;
6876   int code = GET_CODE (x);
6877   scalar_int_mode int_mode;
6878
6879   /* By default, assume that everything has equivalent cost to the
6880      cheapest instruction.  Any additional costs are applied as a delta
6881      above this default.  */
6882   *cost = COSTS_N_INSNS (1);
6883
6884   switch (code)
6885     {
6886     case SET:
6887       /* The cost depends entirely on the operands to SET.  */
6888       *cost = 0;
6889       op0 = SET_DEST (x);
6890       op1 = SET_SRC (x);
6891
6892       switch (GET_CODE (op0))
6893         {
6894         case MEM:
6895           if (speed)
6896             {
6897               rtx address = XEXP (op0, 0);
6898               if (VECTOR_MODE_P (mode))
6899                 *cost += extra_cost->ldst.storev;
6900               else if (GET_MODE_CLASS (mode) == MODE_INT)
6901                 *cost += extra_cost->ldst.store;
6902               else if (mode == SFmode)
6903                 *cost += extra_cost->ldst.storef;
6904               else if (mode == DFmode)
6905                 *cost += extra_cost->ldst.stored;
6906
6907               *cost +=
6908                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6909                                                      0, speed));
6910             }
6911
6912           *cost += rtx_cost (op1, mode, SET, 1, speed);
6913           return true;
6914
6915         case SUBREG:
6916           if (! REG_P (SUBREG_REG (op0)))
6917             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6918
6919           /* Fall through.  */
6920         case REG:
6921           /* The cost is one per vector-register copied.  */
6922           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6923             {
6924               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6925                               / GET_MODE_SIZE (V4SImode);
6926               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6927             }
6928           /* const0_rtx is in general free, but we will use an
6929              instruction to set a register to 0.  */
6930           else if (REG_P (op1) || op1 == const0_rtx)
6931             {
6932               /* The cost is 1 per register copied.  */
6933               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6934                               / UNITS_PER_WORD;
6935               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6936             }
6937           else
6938             /* Cost is just the cost of the RHS of the set.  */
6939             *cost += rtx_cost (op1, mode, SET, 1, speed);
6940           return true;
6941
6942         case ZERO_EXTRACT:
6943         case SIGN_EXTRACT:
6944           /* Bit-field insertion.  Strip any redundant widening of
6945              the RHS to meet the width of the target.  */
6946           if (GET_CODE (op1) == SUBREG)
6947             op1 = SUBREG_REG (op1);
6948           if ((GET_CODE (op1) == ZERO_EXTEND
6949                || GET_CODE (op1) == SIGN_EXTEND)
6950               && CONST_INT_P (XEXP (op0, 1))
6951               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6952               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6953             op1 = XEXP (op1, 0);
6954
6955           if (CONST_INT_P (op1))
6956             {
6957               /* MOV immediate is assumed to always be cheap.  */
6958               *cost = COSTS_N_INSNS (1);
6959             }
6960           else
6961             {
6962               /* BFM.  */
6963               if (speed)
6964                 *cost += extra_cost->alu.bfi;
6965               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6966             }
6967
6968           return true;
6969
6970         default:
6971           /* We can't make sense of this, assume default cost.  */
6972           *cost = COSTS_N_INSNS (1);
6973           return false;
6974         }
6975       return false;
6976
6977     case CONST_INT:
6978       /* If an instruction can incorporate a constant within the
6979          instruction, the instruction's expression avoids calling
6980          rtx_cost() on the constant.  If rtx_cost() is called on a
6981          constant, then it is usually because the constant must be
6982          moved into a register by one or more instructions.
6983
6984          The exception is constant 0, which can be expressed
6985          as XZR/WZR and is therefore free.  The exception to this is
6986          if we have (set (reg) (const0_rtx)) in which case we must cost
6987          the move.  However, we can catch that when we cost the SET, so
6988          we don't need to consider that here.  */
6989       if (x == const0_rtx)
6990         *cost = 0;
6991       else
6992         {
6993           /* To an approximation, building any other constant is
6994              proportionally expensive to the number of instructions
6995              required to build that constant.  This is true whether we
6996              are compiling for SPEED or otherwise.  */
6997           if (!is_a <scalar_int_mode> (mode, &int_mode))
6998             int_mode = word_mode;
6999           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7000                                  (NULL_RTX, x, false, int_mode));
7001         }
7002       return true;
7003
7004     case CONST_DOUBLE:
7005
7006       /* First determine number of instructions to do the move
7007           as an integer constant.  */
7008       if (!aarch64_float_const_representable_p (x)
7009            && !aarch64_can_const_movi_rtx_p (x, mode)
7010            && aarch64_float_const_rtx_p (x))
7011         {
7012           unsigned HOST_WIDE_INT ival;
7013           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7014           gcc_assert (succeed);
7015
7016           scalar_int_mode imode = (mode == HFmode
7017                                    ? SImode
7018                                    : int_mode_for_mode (mode).require ());
7019           int ncost = aarch64_internal_mov_immediate
7020                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7021           *cost += COSTS_N_INSNS (ncost);
7022           return true;
7023         }
7024
7025       if (speed)
7026         {
7027           /* mov[df,sf]_aarch64.  */
7028           if (aarch64_float_const_representable_p (x))
7029             /* FMOV (scalar immediate).  */
7030             *cost += extra_cost->fp[mode == DFmode].fpconst;
7031           else if (!aarch64_float_const_zero_rtx_p (x))
7032             {
7033               /* This will be a load from memory.  */
7034               if (mode == DFmode)
7035                 *cost += extra_cost->ldst.loadd;
7036               else
7037                 *cost += extra_cost->ldst.loadf;
7038             }
7039           else
7040             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7041                or MOV v0.s[0], wzr - neither of which are modeled by the
7042                cost tables.  Just use the default cost.  */
7043             {
7044             }
7045         }
7046
7047       return true;
7048
7049     case MEM:
7050       if (speed)
7051         {
7052           /* For loads we want the base cost of a load, plus an
7053              approximation for the additional cost of the addressing
7054              mode.  */
7055           rtx address = XEXP (x, 0);
7056           if (VECTOR_MODE_P (mode))
7057             *cost += extra_cost->ldst.loadv;
7058           else if (GET_MODE_CLASS (mode) == MODE_INT)
7059             *cost += extra_cost->ldst.load;
7060           else if (mode == SFmode)
7061             *cost += extra_cost->ldst.loadf;
7062           else if (mode == DFmode)
7063             *cost += extra_cost->ldst.loadd;
7064
7065           *cost +=
7066                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7067                                                      0, speed));
7068         }
7069
7070       return true;
7071
7072     case NEG:
7073       op0 = XEXP (x, 0);
7074
7075       if (VECTOR_MODE_P (mode))
7076         {
7077           if (speed)
7078             {
7079               /* FNEG.  */
7080               *cost += extra_cost->vect.alu;
7081             }
7082           return false;
7083         }
7084
7085       if (GET_MODE_CLASS (mode) == MODE_INT)
7086         {
7087           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7088               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7089             {
7090               /* CSETM.  */
7091               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7092               return true;
7093             }
7094
7095           /* Cost this as SUB wzr, X.  */
7096           op0 = CONST0_RTX (mode);
7097           op1 = XEXP (x, 0);
7098           goto cost_minus;
7099         }
7100
7101       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7102         {
7103           /* Support (neg(fma...)) as a single instruction only if
7104              sign of zeros is unimportant.  This matches the decision
7105              making in aarch64.md.  */
7106           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7107             {
7108               /* FNMADD.  */
7109               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7110               return true;
7111             }
7112           if (GET_CODE (op0) == MULT)
7113             {
7114               /* FNMUL.  */
7115               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7116               return true;
7117             }
7118           if (speed)
7119             /* FNEG.  */
7120             *cost += extra_cost->fp[mode == DFmode].neg;
7121           return false;
7122         }
7123
7124       return false;
7125
7126     case CLRSB:
7127     case CLZ:
7128       if (speed)
7129         {
7130           if (VECTOR_MODE_P (mode))
7131             *cost += extra_cost->vect.alu;
7132           else
7133             *cost += extra_cost->alu.clz;
7134         }
7135
7136       return false;
7137
7138     case COMPARE:
7139       op0 = XEXP (x, 0);
7140       op1 = XEXP (x, 1);
7141
7142       if (op1 == const0_rtx
7143           && GET_CODE (op0) == AND)
7144         {
7145           x = op0;
7146           mode = GET_MODE (op0);
7147           goto cost_logic;
7148         }
7149
7150       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7151         {
7152           /* TODO: A write to the CC flags possibly costs extra, this
7153              needs encoding in the cost tables.  */
7154
7155           mode = GET_MODE (op0);
7156           /* ANDS.  */
7157           if (GET_CODE (op0) == AND)
7158             {
7159               x = op0;
7160               goto cost_logic;
7161             }
7162
7163           if (GET_CODE (op0) == PLUS)
7164             {
7165               /* ADDS (and CMN alias).  */
7166               x = op0;
7167               goto cost_plus;
7168             }
7169
7170           if (GET_CODE (op0) == MINUS)
7171             {
7172               /* SUBS.  */
7173               x = op0;
7174               goto cost_minus;
7175             }
7176
7177           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7178               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7179               && CONST_INT_P (XEXP (op0, 2)))
7180             {
7181               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7182                  Handle it here directly rather than going to cost_logic
7183                  since we know the immediate generated for the TST is valid
7184                  so we can avoid creating an intermediate rtx for it only
7185                  for costing purposes.  */
7186               if (speed)
7187                 *cost += extra_cost->alu.logical;
7188
7189               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7190                                  ZERO_EXTRACT, 0, speed);
7191               return true;
7192             }
7193
7194           if (GET_CODE (op1) == NEG)
7195             {
7196               /* CMN.  */
7197               if (speed)
7198                 *cost += extra_cost->alu.arith;
7199
7200               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7201               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7202               return true;
7203             }
7204
7205           /* CMP.
7206
7207              Compare can freely swap the order of operands, and
7208              canonicalization puts the more complex operation first.
7209              But the integer MINUS logic expects the shift/extend
7210              operation in op1.  */
7211           if (! (REG_P (op0)
7212                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7213           {
7214             op0 = XEXP (x, 1);
7215             op1 = XEXP (x, 0);
7216           }
7217           goto cost_minus;
7218         }
7219
7220       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7221         {
7222           /* FCMP.  */
7223           if (speed)
7224             *cost += extra_cost->fp[mode == DFmode].compare;
7225
7226           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7227             {
7228               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7229               /* FCMP supports constant 0.0 for no extra cost. */
7230               return true;
7231             }
7232           return false;
7233         }
7234
7235       if (VECTOR_MODE_P (mode))
7236         {
7237           /* Vector compare.  */
7238           if (speed)
7239             *cost += extra_cost->vect.alu;
7240
7241           if (aarch64_float_const_zero_rtx_p (op1))
7242             {
7243               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7244                  cost.  */
7245               return true;
7246             }
7247           return false;
7248         }
7249       return false;
7250
7251     case MINUS:
7252       {
7253         op0 = XEXP (x, 0);
7254         op1 = XEXP (x, 1);
7255
7256 cost_minus:
7257         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7258
7259         /* Detect valid immediates.  */
7260         if ((GET_MODE_CLASS (mode) == MODE_INT
7261              || (GET_MODE_CLASS (mode) == MODE_CC
7262                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7263             && CONST_INT_P (op1)
7264             && aarch64_uimm12_shift (INTVAL (op1)))
7265           {
7266             if (speed)
7267               /* SUB(S) (immediate).  */
7268               *cost += extra_cost->alu.arith;
7269             return true;
7270           }
7271
7272         /* Look for SUB (extended register).  */
7273         if (is_a <scalar_int_mode> (mode, &int_mode)
7274             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7275           {
7276             if (speed)
7277               *cost += extra_cost->alu.extend_arith;
7278
7279             op1 = aarch64_strip_extend (op1, true);
7280             *cost += rtx_cost (op1, VOIDmode,
7281                                (enum rtx_code) GET_CODE (op1), 0, speed);
7282             return true;
7283           }
7284
7285         rtx new_op1 = aarch64_strip_extend (op1, false);
7286
7287         /* Cost this as an FMA-alike operation.  */
7288         if ((GET_CODE (new_op1) == MULT
7289              || aarch64_shift_p (GET_CODE (new_op1)))
7290             && code != COMPARE)
7291           {
7292             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7293                                             (enum rtx_code) code,
7294                                             speed);
7295             return true;
7296           }
7297
7298         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7299
7300         if (speed)
7301           {
7302             if (VECTOR_MODE_P (mode))
7303               {
7304                 /* Vector SUB.  */
7305                 *cost += extra_cost->vect.alu;
7306               }
7307             else if (GET_MODE_CLASS (mode) == MODE_INT)
7308               {
7309                 /* SUB(S).  */
7310                 *cost += extra_cost->alu.arith;
7311               }
7312             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7313               {
7314                 /* FSUB.  */
7315                 *cost += extra_cost->fp[mode == DFmode].addsub;
7316               }
7317           }
7318         return true;
7319       }
7320
7321     case PLUS:
7322       {
7323         rtx new_op0;
7324
7325         op0 = XEXP (x, 0);
7326         op1 = XEXP (x, 1);
7327
7328 cost_plus:
7329         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7330             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7331           {
7332             /* CSINC.  */
7333             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7334             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7335             return true;
7336           }
7337
7338         if (GET_MODE_CLASS (mode) == MODE_INT
7339             && CONST_INT_P (op1)
7340             && aarch64_uimm12_shift (INTVAL (op1)))
7341           {
7342             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7343
7344             if (speed)
7345               /* ADD (immediate).  */
7346               *cost += extra_cost->alu.arith;
7347             return true;
7348           }
7349
7350         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7351
7352         /* Look for ADD (extended register).  */
7353         if (is_a <scalar_int_mode> (mode, &int_mode)
7354             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7355           {
7356             if (speed)
7357               *cost += extra_cost->alu.extend_arith;
7358
7359             op0 = aarch64_strip_extend (op0, true);
7360             *cost += rtx_cost (op0, VOIDmode,
7361                                (enum rtx_code) GET_CODE (op0), 0, speed);
7362             return true;
7363           }
7364
7365         /* Strip any extend, leave shifts behind as we will
7366            cost them through mult_cost.  */
7367         new_op0 = aarch64_strip_extend (op0, false);
7368
7369         if (GET_CODE (new_op0) == MULT
7370             || aarch64_shift_p (GET_CODE (new_op0)))
7371           {
7372             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7373                                             speed);
7374             return true;
7375           }
7376
7377         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7378
7379         if (speed)
7380           {
7381             if (VECTOR_MODE_P (mode))
7382               {
7383                 /* Vector ADD.  */
7384                 *cost += extra_cost->vect.alu;
7385               }
7386             else if (GET_MODE_CLASS (mode) == MODE_INT)
7387               {
7388                 /* ADD.  */
7389                 *cost += extra_cost->alu.arith;
7390               }
7391             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7392               {
7393                 /* FADD.  */
7394                 *cost += extra_cost->fp[mode == DFmode].addsub;
7395               }
7396           }
7397         return true;
7398       }
7399
7400     case BSWAP:
7401       *cost = COSTS_N_INSNS (1);
7402
7403       if (speed)
7404         {
7405           if (VECTOR_MODE_P (mode))
7406             *cost += extra_cost->vect.alu;
7407           else
7408             *cost += extra_cost->alu.rev;
7409         }
7410       return false;
7411
7412     case IOR:
7413       if (aarch_rev16_p (x))
7414         {
7415           *cost = COSTS_N_INSNS (1);
7416
7417           if (speed)
7418             {
7419               if (VECTOR_MODE_P (mode))
7420                 *cost += extra_cost->vect.alu;
7421               else
7422                 *cost += extra_cost->alu.rev;
7423             }
7424           return true;
7425         }
7426
7427       if (aarch64_extr_rtx_p (x, &op0, &op1))
7428         {
7429           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7430           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7431           if (speed)
7432             *cost += extra_cost->alu.shift;
7433
7434           return true;
7435         }
7436     /* Fall through.  */
7437     case XOR:
7438     case AND:
7439     cost_logic:
7440       op0 = XEXP (x, 0);
7441       op1 = XEXP (x, 1);
7442
7443       if (VECTOR_MODE_P (mode))
7444         {
7445           if (speed)
7446             *cost += extra_cost->vect.alu;
7447           return true;
7448         }
7449
7450       if (code == AND
7451           && GET_CODE (op0) == MULT
7452           && CONST_INT_P (XEXP (op0, 1))
7453           && CONST_INT_P (op1)
7454           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7455                                INTVAL (op1)) != 0)
7456         {
7457           /* This is a UBFM/SBFM.  */
7458           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7459           if (speed)
7460             *cost += extra_cost->alu.bfx;
7461           return true;
7462         }
7463
7464       if (is_int_mode (mode, &int_mode))
7465         {
7466           if (CONST_INT_P (op1))
7467             {
7468               /* We have a mask + shift version of a UBFIZ
7469                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7470               if (GET_CODE (op0) == ASHIFT
7471                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7472                                                          XEXP (op0, 1)))
7473                 {
7474                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
7475                                      (enum rtx_code) code, 0, speed);
7476                   if (speed)
7477                     *cost += extra_cost->alu.bfx;
7478
7479                   return true;
7480                 }
7481               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7482                 {
7483                 /* We possibly get the immediate for free, this is not
7484                    modelled.  */
7485                   *cost += rtx_cost (op0, int_mode,
7486                                      (enum rtx_code) code, 0, speed);
7487                   if (speed)
7488                     *cost += extra_cost->alu.logical;
7489
7490                   return true;
7491                 }
7492             }
7493           else
7494             {
7495               rtx new_op0 = op0;
7496
7497               /* Handle ORN, EON, or BIC.  */
7498               if (GET_CODE (op0) == NOT)
7499                 op0 = XEXP (op0, 0);
7500
7501               new_op0 = aarch64_strip_shift (op0);
7502
7503               /* If we had a shift on op0 then this is a logical-shift-
7504                  by-register/immediate operation.  Otherwise, this is just
7505                  a logical operation.  */
7506               if (speed)
7507                 {
7508                   if (new_op0 != op0)
7509                     {
7510                       /* Shift by immediate.  */
7511                       if (CONST_INT_P (XEXP (op0, 1)))
7512                         *cost += extra_cost->alu.log_shift;
7513                       else
7514                         *cost += extra_cost->alu.log_shift_reg;
7515                     }
7516                   else
7517                     *cost += extra_cost->alu.logical;
7518                 }
7519
7520               /* In both cases we want to cost both operands.  */
7521               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7522                                  0, speed);
7523               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7524                                  1, speed);
7525
7526               return true;
7527             }
7528         }
7529       return false;
7530
7531     case NOT:
7532       x = XEXP (x, 0);
7533       op0 = aarch64_strip_shift (x);
7534
7535       if (VECTOR_MODE_P (mode))
7536         {
7537           /* Vector NOT.  */
7538           *cost += extra_cost->vect.alu;
7539           return false;
7540         }
7541
7542       /* MVN-shifted-reg.  */
7543       if (op0 != x)
7544         {
7545           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7546
7547           if (speed)
7548             *cost += extra_cost->alu.log_shift;
7549
7550           return true;
7551         }
7552       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7553          Handle the second form here taking care that 'a' in the above can
7554          be a shift.  */
7555       else if (GET_CODE (op0) == XOR)
7556         {
7557           rtx newop0 = XEXP (op0, 0);
7558           rtx newop1 = XEXP (op0, 1);
7559           rtx op0_stripped = aarch64_strip_shift (newop0);
7560
7561           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7562           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7563
7564           if (speed)
7565             {
7566               if (op0_stripped != newop0)
7567                 *cost += extra_cost->alu.log_shift;
7568               else
7569                 *cost += extra_cost->alu.logical;
7570             }
7571
7572           return true;
7573         }
7574       /* MVN.  */
7575       if (speed)
7576         *cost += extra_cost->alu.logical;
7577
7578       return false;
7579
7580     case ZERO_EXTEND:
7581
7582       op0 = XEXP (x, 0);
7583       /* If a value is written in SI mode, then zero extended to DI
7584          mode, the operation will in general be free as a write to
7585          a 'w' register implicitly zeroes the upper bits of an 'x'
7586          register.  However, if this is
7587
7588            (set (reg) (zero_extend (reg)))
7589
7590          we must cost the explicit register move.  */
7591       if (mode == DImode
7592           && GET_MODE (op0) == SImode
7593           && outer == SET)
7594         {
7595           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7596
7597         /* If OP_COST is non-zero, then the cost of the zero extend
7598            is effectively the cost of the inner operation.  Otherwise
7599            we have a MOV instruction and we take the cost from the MOV
7600            itself.  This is true independently of whether we are
7601            optimizing for space or time.  */
7602           if (op_cost)
7603             *cost = op_cost;
7604
7605           return true;
7606         }
7607       else if (MEM_P (op0))
7608         {
7609           /* All loads can zero extend to any size for free.  */
7610           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7611           return true;
7612         }
7613
7614       op0 = aarch64_extend_bitfield_pattern_p (x);
7615       if (op0)
7616         {
7617           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7618           if (speed)
7619             *cost += extra_cost->alu.bfx;
7620           return true;
7621         }
7622
7623       if (speed)
7624         {
7625           if (VECTOR_MODE_P (mode))
7626             {
7627               /* UMOV.  */
7628               *cost += extra_cost->vect.alu;
7629             }
7630           else
7631             {
7632               /* We generate an AND instead of UXTB/UXTH.  */
7633               *cost += extra_cost->alu.logical;
7634             }
7635         }
7636       return false;
7637
7638     case SIGN_EXTEND:
7639       if (MEM_P (XEXP (x, 0)))
7640         {
7641           /* LDRSH.  */
7642           if (speed)
7643             {
7644               rtx address = XEXP (XEXP (x, 0), 0);
7645               *cost += extra_cost->ldst.load_sign_extend;
7646
7647               *cost +=
7648                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7649                                                      0, speed));
7650             }
7651           return true;
7652         }
7653
7654       op0 = aarch64_extend_bitfield_pattern_p (x);
7655       if (op0)
7656         {
7657           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7658           if (speed)
7659             *cost += extra_cost->alu.bfx;
7660           return true;
7661         }
7662
7663       if (speed)
7664         {
7665           if (VECTOR_MODE_P (mode))
7666             *cost += extra_cost->vect.alu;
7667           else
7668             *cost += extra_cost->alu.extend;
7669         }
7670       return false;
7671
7672     case ASHIFT:
7673       op0 = XEXP (x, 0);
7674       op1 = XEXP (x, 1);
7675
7676       if (CONST_INT_P (op1))
7677         {
7678           if (speed)
7679             {
7680               if (VECTOR_MODE_P (mode))
7681                 {
7682                   /* Vector shift (immediate).  */
7683                   *cost += extra_cost->vect.alu;
7684                 }
7685               else
7686                 {
7687                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7688                      aliases.  */
7689                   *cost += extra_cost->alu.shift;
7690                 }
7691             }
7692
7693           /* We can incorporate zero/sign extend for free.  */
7694           if (GET_CODE (op0) == ZERO_EXTEND
7695               || GET_CODE (op0) == SIGN_EXTEND)
7696             op0 = XEXP (op0, 0);
7697
7698           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7699           return true;
7700         }
7701       else
7702         {
7703           if (VECTOR_MODE_P (mode))
7704             {
7705               if (speed)
7706                 /* Vector shift (register).  */
7707                 *cost += extra_cost->vect.alu;
7708             }
7709           else
7710             {
7711               if (speed)
7712                 /* LSLV.  */
7713                 *cost += extra_cost->alu.shift_reg;
7714
7715               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7716                   && CONST_INT_P (XEXP (op1, 1))
7717                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7718                 {
7719                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7720                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7721                      don't recurse into it.  */
7722                   return true;
7723                 }
7724             }
7725           return false;  /* All arguments need to be in registers.  */
7726         }
7727
7728     case ROTATE:
7729     case ROTATERT:
7730     case LSHIFTRT:
7731     case ASHIFTRT:
7732       op0 = XEXP (x, 0);
7733       op1 = XEXP (x, 1);
7734
7735       if (CONST_INT_P (op1))
7736         {
7737           /* ASR (immediate) and friends.  */
7738           if (speed)
7739             {
7740               if (VECTOR_MODE_P (mode))
7741                 *cost += extra_cost->vect.alu;
7742               else
7743                 *cost += extra_cost->alu.shift;
7744             }
7745
7746           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7747           return true;
7748         }
7749       else
7750         {
7751           if (VECTOR_MODE_P (mode))
7752             {
7753               if (speed)
7754                 /* Vector shift (register).  */
7755                 *cost += extra_cost->vect.alu;
7756             }
7757           else
7758             {
7759               if (speed)
7760                 /* ASR (register) and friends.  */
7761                 *cost += extra_cost->alu.shift_reg;
7762
7763               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7764                   && CONST_INT_P (XEXP (op1, 1))
7765                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7766                 {
7767                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7768                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7769                      don't recurse into it.  */
7770                   return true;
7771                 }
7772             }
7773           return false;  /* All arguments need to be in registers.  */
7774         }
7775
7776     case SYMBOL_REF:
7777
7778       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7779           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7780         {
7781           /* LDR.  */
7782           if (speed)
7783             *cost += extra_cost->ldst.load;
7784         }
7785       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7786                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7787         {
7788           /* ADRP, followed by ADD.  */
7789           *cost += COSTS_N_INSNS (1);
7790           if (speed)
7791             *cost += 2 * extra_cost->alu.arith;
7792         }
7793       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7794                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7795         {
7796           /* ADR.  */
7797           if (speed)
7798             *cost += extra_cost->alu.arith;
7799         }
7800
7801       if (flag_pic)
7802         {
7803           /* One extra load instruction, after accessing the GOT.  */
7804           *cost += COSTS_N_INSNS (1);
7805           if (speed)
7806             *cost += extra_cost->ldst.load;
7807         }
7808       return true;
7809
7810     case HIGH:
7811     case LO_SUM:
7812       /* ADRP/ADD (immediate).  */
7813       if (speed)
7814         *cost += extra_cost->alu.arith;
7815       return true;
7816
7817     case ZERO_EXTRACT:
7818     case SIGN_EXTRACT:
7819       /* UBFX/SBFX.  */
7820       if (speed)
7821         {
7822           if (VECTOR_MODE_P (mode))
7823             *cost += extra_cost->vect.alu;
7824           else
7825             *cost += extra_cost->alu.bfx;
7826         }
7827
7828       /* We can trust that the immediates used will be correct (there
7829          are no by-register forms), so we need only cost op0.  */
7830       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7831       return true;
7832
7833     case MULT:
7834       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7835       /* aarch64_rtx_mult_cost always handles recursion to its
7836          operands.  */
7837       return true;
7838
7839     case MOD:
7840     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7841        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7842        an unconditional negate.  This case should only ever be reached through
7843        the set_smod_pow2_cheap check in expmed.c.  */
7844       if (CONST_INT_P (XEXP (x, 1))
7845           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7846           && (mode == SImode || mode == DImode))
7847         {
7848           /* We expand to 4 instructions.  Reset the baseline.  */
7849           *cost = COSTS_N_INSNS (4);
7850
7851           if (speed)
7852             *cost += 2 * extra_cost->alu.logical
7853                      + 2 * extra_cost->alu.arith;
7854
7855           return true;
7856         }
7857
7858     /* Fall-through.  */
7859     case UMOD:
7860       if (speed)
7861         {
7862           /* Slighly prefer UMOD over SMOD.  */
7863           if (VECTOR_MODE_P (mode))
7864             *cost += extra_cost->vect.alu;
7865           else if (GET_MODE_CLASS (mode) == MODE_INT)
7866             *cost += (extra_cost->mult[mode == DImode].add
7867                       + extra_cost->mult[mode == DImode].idiv
7868                       + (code == MOD ? 1 : 0));
7869         }
7870       return false;  /* All arguments need to be in registers.  */
7871
7872     case DIV:
7873     case UDIV:
7874     case SQRT:
7875       if (speed)
7876         {
7877           if (VECTOR_MODE_P (mode))
7878             *cost += extra_cost->vect.alu;
7879           else if (GET_MODE_CLASS (mode) == MODE_INT)
7880             /* There is no integer SQRT, so only DIV and UDIV can get
7881                here.  */
7882             *cost += (extra_cost->mult[mode == DImode].idiv
7883                      /* Slighly prefer UDIV over SDIV.  */
7884                      + (code == DIV ? 1 : 0));
7885           else
7886             *cost += extra_cost->fp[mode == DFmode].div;
7887         }
7888       return false;  /* All arguments need to be in registers.  */
7889
7890     case IF_THEN_ELSE:
7891       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7892                                          XEXP (x, 2), cost, speed);
7893
7894     case EQ:
7895     case NE:
7896     case GT:
7897     case GTU:
7898     case LT:
7899     case LTU:
7900     case GE:
7901     case GEU:
7902     case LE:
7903     case LEU:
7904
7905       return false; /* All arguments must be in registers.  */
7906
7907     case FMA:
7908       op0 = XEXP (x, 0);
7909       op1 = XEXP (x, 1);
7910       op2 = XEXP (x, 2);
7911
7912       if (speed)
7913         {
7914           if (VECTOR_MODE_P (mode))
7915             *cost += extra_cost->vect.alu;
7916           else
7917             *cost += extra_cost->fp[mode == DFmode].fma;
7918         }
7919
7920       /* FMSUB, FNMADD, and FNMSUB are free.  */
7921       if (GET_CODE (op0) == NEG)
7922         op0 = XEXP (op0, 0);
7923
7924       if (GET_CODE (op2) == NEG)
7925         op2 = XEXP (op2, 0);
7926
7927       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7928          and the by-element operand as operand 0.  */
7929       if (GET_CODE (op1) == NEG)
7930         op1 = XEXP (op1, 0);
7931
7932       /* Catch vector-by-element operations.  The by-element operand can
7933          either be (vec_duplicate (vec_select (x))) or just
7934          (vec_select (x)), depending on whether we are multiplying by
7935          a vector or a scalar.
7936
7937          Canonicalization is not very good in these cases, FMA4 will put the
7938          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7939       if (GET_CODE (op0) == VEC_DUPLICATE)
7940         op0 = XEXP (op0, 0);
7941       else if (GET_CODE (op1) == VEC_DUPLICATE)
7942         op1 = XEXP (op1, 0);
7943
7944       if (GET_CODE (op0) == VEC_SELECT)
7945         op0 = XEXP (op0, 0);
7946       else if (GET_CODE (op1) == VEC_SELECT)
7947         op1 = XEXP (op1, 0);
7948
7949       /* If the remaining parameters are not registers,
7950          get the cost to put them into registers.  */
7951       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7952       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7953       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7954       return true;
7955
7956     case FLOAT:
7957     case UNSIGNED_FLOAT:
7958       if (speed)
7959         *cost += extra_cost->fp[mode == DFmode].fromint;
7960       return false;
7961
7962     case FLOAT_EXTEND:
7963       if (speed)
7964         {
7965           if (VECTOR_MODE_P (mode))
7966             {
7967               /*Vector truncate.  */
7968               *cost += extra_cost->vect.alu;
7969             }
7970           else
7971             *cost += extra_cost->fp[mode == DFmode].widen;
7972         }
7973       return false;
7974
7975     case FLOAT_TRUNCATE:
7976       if (speed)
7977         {
7978           if (VECTOR_MODE_P (mode))
7979             {
7980               /*Vector conversion.  */
7981               *cost += extra_cost->vect.alu;
7982             }
7983           else
7984             *cost += extra_cost->fp[mode == DFmode].narrow;
7985         }
7986       return false;
7987
7988     case FIX:
7989     case UNSIGNED_FIX:
7990       x = XEXP (x, 0);
7991       /* Strip the rounding part.  They will all be implemented
7992          by the fcvt* family of instructions anyway.  */
7993       if (GET_CODE (x) == UNSPEC)
7994         {
7995           unsigned int uns_code = XINT (x, 1);
7996
7997           if (uns_code == UNSPEC_FRINTA
7998               || uns_code == UNSPEC_FRINTM
7999               || uns_code == UNSPEC_FRINTN
8000               || uns_code == UNSPEC_FRINTP
8001               || uns_code == UNSPEC_FRINTZ)
8002             x = XVECEXP (x, 0, 0);
8003         }
8004
8005       if (speed)
8006         {
8007           if (VECTOR_MODE_P (mode))
8008             *cost += extra_cost->vect.alu;
8009           else
8010             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8011         }
8012
8013       /* We can combine fmul by a power of 2 followed by a fcvt into a single
8014          fixed-point fcvt.  */
8015       if (GET_CODE (x) == MULT
8016           && ((VECTOR_MODE_P (mode)
8017                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8018               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8019         {
8020           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8021                              0, speed);
8022           return true;
8023         }
8024
8025       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8026       return true;
8027
8028     case ABS:
8029       if (VECTOR_MODE_P (mode))
8030         {
8031           /* ABS (vector).  */
8032           if (speed)
8033             *cost += extra_cost->vect.alu;
8034         }
8035       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8036         {
8037           op0 = XEXP (x, 0);
8038
8039           /* FABD, which is analogous to FADD.  */
8040           if (GET_CODE (op0) == MINUS)
8041             {
8042               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8043               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8044               if (speed)
8045                 *cost += extra_cost->fp[mode == DFmode].addsub;
8046
8047               return true;
8048             }
8049           /* Simple FABS is analogous to FNEG.  */
8050           if (speed)
8051             *cost += extra_cost->fp[mode == DFmode].neg;
8052         }
8053       else
8054         {
8055           /* Integer ABS will either be split to
8056              two arithmetic instructions, or will be an ABS
8057              (scalar), which we don't model.  */
8058           *cost = COSTS_N_INSNS (2);
8059           if (speed)
8060             *cost += 2 * extra_cost->alu.arith;
8061         }
8062       return false;
8063
8064     case SMAX:
8065     case SMIN:
8066       if (speed)
8067         {
8068           if (VECTOR_MODE_P (mode))
8069             *cost += extra_cost->vect.alu;
8070           else
8071             {
8072               /* FMAXNM/FMINNM/FMAX/FMIN.
8073                  TODO: This may not be accurate for all implementations, but
8074                  we do not model this in the cost tables.  */
8075               *cost += extra_cost->fp[mode == DFmode].addsub;
8076             }
8077         }
8078       return false;
8079
8080     case UNSPEC:
8081       /* The floating point round to integer frint* instructions.  */
8082       if (aarch64_frint_unspec_p (XINT (x, 1)))
8083         {
8084           if (speed)
8085             *cost += extra_cost->fp[mode == DFmode].roundint;
8086
8087           return false;
8088         }
8089
8090       if (XINT (x, 1) == UNSPEC_RBIT)
8091         {
8092           if (speed)
8093             *cost += extra_cost->alu.rev;
8094
8095           return false;
8096         }
8097       break;
8098
8099     case TRUNCATE:
8100
8101       /* Decompose <su>muldi3_highpart.  */
8102       if (/* (truncate:DI  */
8103           mode == DImode
8104           /*   (lshiftrt:TI  */
8105           && GET_MODE (XEXP (x, 0)) == TImode
8106           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8107           /*      (mult:TI  */
8108           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8109           /*        (ANY_EXTEND:TI (reg:DI))
8110                     (ANY_EXTEND:TI (reg:DI)))  */
8111           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8112                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8113               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8114                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8115           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8116           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8117           /*     (const_int 64)  */
8118           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8119           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8120         {
8121           /* UMULH/SMULH.  */
8122           if (speed)
8123             *cost += extra_cost->mult[mode == DImode].extend;
8124           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8125                              mode, MULT, 0, speed);
8126           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8127                              mode, MULT, 1, speed);
8128           return true;
8129         }
8130
8131       /* Fall through.  */
8132     default:
8133       break;
8134     }
8135
8136   if (dump_file
8137       && flag_aarch64_verbose_cost)
8138     fprintf (dump_file,
8139       "\nFailed to cost RTX.  Assuming default cost.\n");
8140
8141   return true;
8142 }
8143
8144 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8145    calculated for X.  This cost is stored in *COST.  Returns true
8146    if the total cost of X was calculated.  */
8147 static bool
8148 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8149                    int param, int *cost, bool speed)
8150 {
8151   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8152
8153   if (dump_file
8154       && flag_aarch64_verbose_cost)
8155     {
8156       print_rtl_single (dump_file, x);
8157       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8158                speed ? "Hot" : "Cold",
8159                *cost, result ? "final" : "partial");
8160     }
8161
8162   return result;
8163 }
8164
8165 static int
8166 aarch64_register_move_cost (machine_mode mode,
8167                             reg_class_t from_i, reg_class_t to_i)
8168 {
8169   enum reg_class from = (enum reg_class) from_i;
8170   enum reg_class to = (enum reg_class) to_i;
8171   const struct cpu_regmove_cost *regmove_cost
8172     = aarch64_tune_params.regmove_cost;
8173
8174   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8175   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8176     to = GENERAL_REGS;
8177
8178   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8179     from = GENERAL_REGS;
8180
8181   /* Moving between GPR and stack cost is the same as GP2GP.  */
8182   if ((from == GENERAL_REGS && to == STACK_REG)
8183       || (to == GENERAL_REGS && from == STACK_REG))
8184     return regmove_cost->GP2GP;
8185
8186   /* To/From the stack register, we move via the gprs.  */
8187   if (to == STACK_REG || from == STACK_REG)
8188     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8189             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8190
8191   if (GET_MODE_SIZE (mode) == 16)
8192     {
8193       /* 128-bit operations on general registers require 2 instructions.  */
8194       if (from == GENERAL_REGS && to == GENERAL_REGS)
8195         return regmove_cost->GP2GP * 2;
8196       else if (from == GENERAL_REGS)
8197         return regmove_cost->GP2FP * 2;
8198       else if (to == GENERAL_REGS)
8199         return regmove_cost->FP2GP * 2;
8200
8201       /* When AdvSIMD instructions are disabled it is not possible to move
8202          a 128-bit value directly between Q registers.  This is handled in
8203          secondary reload.  A general register is used as a scratch to move
8204          the upper DI value and the lower DI value is moved directly,
8205          hence the cost is the sum of three moves. */
8206       if (! TARGET_SIMD)
8207         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8208
8209       return regmove_cost->FP2FP;
8210     }
8211
8212   if (from == GENERAL_REGS && to == GENERAL_REGS)
8213     return regmove_cost->GP2GP;
8214   else if (from == GENERAL_REGS)
8215     return regmove_cost->GP2FP;
8216   else if (to == GENERAL_REGS)
8217     return regmove_cost->FP2GP;
8218
8219   return regmove_cost->FP2FP;
8220 }
8221
8222 static int
8223 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8224                           reg_class_t rclass ATTRIBUTE_UNUSED,
8225                           bool in ATTRIBUTE_UNUSED)
8226 {
8227   return aarch64_tune_params.memmov_cost;
8228 }
8229
8230 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8231    to optimize 1.0/sqrt.  */
8232
8233 static bool
8234 use_rsqrt_p (machine_mode mode)
8235 {
8236   return (!flag_trapping_math
8237           && flag_unsafe_math_optimizations
8238           && ((aarch64_tune_params.approx_modes->recip_sqrt
8239                & AARCH64_APPROX_MODE (mode))
8240               || flag_mrecip_low_precision_sqrt));
8241 }
8242
8243 /* Function to decide when to use the approximate reciprocal square root
8244    builtin.  */
8245
8246 static tree
8247 aarch64_builtin_reciprocal (tree fndecl)
8248 {
8249   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8250
8251   if (!use_rsqrt_p (mode))
8252     return NULL_TREE;
8253   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8254 }
8255
8256 typedef rtx (*rsqrte_type) (rtx, rtx);
8257
8258 /* Select reciprocal square root initial estimate insn depending on machine
8259    mode.  */
8260
8261 static rsqrte_type
8262 get_rsqrte_type (machine_mode mode)
8263 {
8264   switch (mode)
8265   {
8266     case E_DFmode:   return gen_aarch64_rsqrtedf;
8267     case E_SFmode:   return gen_aarch64_rsqrtesf;
8268     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8269     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8270     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8271     default: gcc_unreachable ();
8272   }
8273 }
8274
8275 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8276
8277 /* Select reciprocal square root series step insn depending on machine mode.  */
8278
8279 static rsqrts_type
8280 get_rsqrts_type (machine_mode mode)
8281 {
8282   switch (mode)
8283   {
8284     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8285     case E_SFmode:   return gen_aarch64_rsqrtssf;
8286     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8287     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8288     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8289     default: gcc_unreachable ();
8290   }
8291 }
8292
8293 /* Emit instruction sequence to compute either the approximate square root
8294    or its approximate reciprocal, depending on the flag RECP, and return
8295    whether the sequence was emitted or not.  */
8296
8297 bool
8298 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8299 {
8300   machine_mode mode = GET_MODE (dst);
8301
8302   if (GET_MODE_INNER (mode) == HFmode)
8303     {
8304       gcc_assert (!recp);
8305       return false;
8306     }
8307
8308   if (!recp)
8309     {
8310       if (!(flag_mlow_precision_sqrt
8311             || (aarch64_tune_params.approx_modes->sqrt
8312                 & AARCH64_APPROX_MODE (mode))))
8313         return false;
8314
8315       if (flag_finite_math_only
8316           || flag_trapping_math
8317           || !flag_unsafe_math_optimizations
8318           || optimize_function_for_size_p (cfun))
8319         return false;
8320     }
8321   else
8322     /* Caller assumes we cannot fail.  */
8323     gcc_assert (use_rsqrt_p (mode));
8324
8325   machine_mode mmsk = mode_for_int_vector (mode).require ();
8326   rtx xmsk = gen_reg_rtx (mmsk);
8327   if (!recp)
8328     /* When calculating the approximate square root, compare the
8329        argument with 0.0 and create a mask.  */
8330     emit_insn (gen_rtx_SET (xmsk,
8331                             gen_rtx_NEG (mmsk,
8332                                          gen_rtx_EQ (mmsk, src,
8333                                                      CONST0_RTX (mode)))));
8334
8335   /* Estimate the approximate reciprocal square root.  */
8336   rtx xdst = gen_reg_rtx (mode);
8337   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8338
8339   /* Iterate over the series twice for SF and thrice for DF.  */
8340   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8341
8342   /* Optionally iterate over the series once less for faster performance
8343      while sacrificing the accuracy.  */
8344   if ((recp && flag_mrecip_low_precision_sqrt)
8345       || (!recp && flag_mlow_precision_sqrt))
8346     iterations--;
8347
8348   /* Iterate over the series to calculate the approximate reciprocal square
8349      root.  */
8350   rtx x1 = gen_reg_rtx (mode);
8351   while (iterations--)
8352     {
8353       rtx x2 = gen_reg_rtx (mode);
8354       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8355
8356       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8357
8358       if (iterations > 0)
8359         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8360     }
8361
8362   if (!recp)
8363     {
8364       /* Qualify the approximate reciprocal square root when the argument is
8365          0.0 by squashing the intermediary result to 0.0.  */
8366       rtx xtmp = gen_reg_rtx (mmsk);
8367       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8368                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8369       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8370
8371       /* Calculate the approximate square root.  */
8372       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8373     }
8374
8375   /* Finalize the approximation.  */
8376   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8377
8378   return true;
8379 }
8380
8381 typedef rtx (*recpe_type) (rtx, rtx);
8382
8383 /* Select reciprocal initial estimate insn depending on machine mode.  */
8384
8385 static recpe_type
8386 get_recpe_type (machine_mode mode)
8387 {
8388   switch (mode)
8389   {
8390     case E_SFmode:   return (gen_aarch64_frecpesf);
8391     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8392     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8393     case E_DFmode:   return (gen_aarch64_frecpedf);
8394     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8395     default:         gcc_unreachable ();
8396   }
8397 }
8398
8399 typedef rtx (*recps_type) (rtx, rtx, rtx);
8400
8401 /* Select reciprocal series step insn depending on machine mode.  */
8402
8403 static recps_type
8404 get_recps_type (machine_mode mode)
8405 {
8406   switch (mode)
8407   {
8408     case E_SFmode:   return (gen_aarch64_frecpssf);
8409     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8410     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8411     case E_DFmode:   return (gen_aarch64_frecpsdf);
8412     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8413     default:         gcc_unreachable ();
8414   }
8415 }
8416
8417 /* Emit the instruction sequence to compute the approximation for the division
8418    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8419
8420 bool
8421 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8422 {
8423   machine_mode mode = GET_MODE (quo);
8424
8425   if (GET_MODE_INNER (mode) == HFmode)
8426     return false;
8427
8428   bool use_approx_division_p = (flag_mlow_precision_div
8429                                 || (aarch64_tune_params.approx_modes->division
8430                                     & AARCH64_APPROX_MODE (mode)));
8431
8432   if (!flag_finite_math_only
8433       || flag_trapping_math
8434       || !flag_unsafe_math_optimizations
8435       || optimize_function_for_size_p (cfun)
8436       || !use_approx_division_p)
8437     return false;
8438
8439   /* Estimate the approximate reciprocal.  */
8440   rtx xrcp = gen_reg_rtx (mode);
8441   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8442
8443   /* Iterate over the series twice for SF and thrice for DF.  */
8444   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8445
8446   /* Optionally iterate over the series once less for faster performance,
8447      while sacrificing the accuracy.  */
8448   if (flag_mlow_precision_div)
8449     iterations--;
8450
8451   /* Iterate over the series to calculate the approximate reciprocal.  */
8452   rtx xtmp = gen_reg_rtx (mode);
8453   while (iterations--)
8454     {
8455       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8456
8457       if (iterations > 0)
8458         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8459     }
8460
8461   if (num != CONST1_RTX (mode))
8462     {
8463       /* As the approximate reciprocal of DEN is already calculated, only
8464          calculate the approximate division when NUM is not 1.0.  */
8465       rtx xnum = force_reg (mode, num);
8466       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8467     }
8468
8469   /* Finalize the approximation.  */
8470   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8471   return true;
8472 }
8473
8474 /* Return the number of instructions that can be issued per cycle.  */
8475 static int
8476 aarch64_sched_issue_rate (void)
8477 {
8478   return aarch64_tune_params.issue_rate;
8479 }
8480
8481 static int
8482 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8483 {
8484   int issue_rate = aarch64_sched_issue_rate ();
8485
8486   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8487 }
8488
8489
8490 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8491    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8492    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8493
8494 static int
8495 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8496                                                     int ready_index)
8497 {
8498   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8499 }
8500
8501
8502 /* Vectorizer cost model target hooks.  */
8503
8504 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8505 static int
8506 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8507                                     tree vectype,
8508                                     int misalign ATTRIBUTE_UNUSED)
8509 {
8510   unsigned elements;
8511   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8512   bool fp = false;
8513
8514   if (vectype != NULL)
8515     fp = FLOAT_TYPE_P (vectype);
8516
8517   switch (type_of_cost)
8518     {
8519       case scalar_stmt:
8520         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8521
8522       case scalar_load:
8523         return costs->scalar_load_cost;
8524
8525       case scalar_store:
8526         return costs->scalar_store_cost;
8527
8528       case vector_stmt:
8529         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8530
8531       case vector_load:
8532         return costs->vec_align_load_cost;
8533
8534       case vector_store:
8535         return costs->vec_store_cost;
8536
8537       case vec_to_scalar:
8538         return costs->vec_to_scalar_cost;
8539
8540       case scalar_to_vec:
8541         return costs->scalar_to_vec_cost;
8542
8543       case unaligned_load:
8544         return costs->vec_unalign_load_cost;
8545
8546       case unaligned_store:
8547         return costs->vec_unalign_store_cost;
8548
8549       case cond_branch_taken:
8550         return costs->cond_taken_branch_cost;
8551
8552       case cond_branch_not_taken:
8553         return costs->cond_not_taken_branch_cost;
8554
8555       case vec_perm:
8556         return costs->vec_permute_cost;
8557
8558       case vec_promote_demote:
8559         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8560
8561       case vec_construct:
8562         elements = TYPE_VECTOR_SUBPARTS (vectype);
8563         return elements / 2 + 1;
8564
8565       default:
8566         gcc_unreachable ();
8567     }
8568 }
8569
8570 /* Implement targetm.vectorize.add_stmt_cost.  */
8571 static unsigned
8572 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8573                        struct _stmt_vec_info *stmt_info, int misalign,
8574                        enum vect_cost_model_location where)
8575 {
8576   unsigned *cost = (unsigned *) data;
8577   unsigned retval = 0;
8578
8579   if (flag_vect_cost_model)
8580     {
8581       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8582       int stmt_cost =
8583             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8584
8585       /* Statements in an inner loop relative to the loop being
8586          vectorized are weighted more heavily.  The value here is
8587          arbitrary and could potentially be improved with analysis.  */
8588       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8589         count *= 50; /*  FIXME  */
8590
8591       retval = (unsigned) (count * stmt_cost);
8592       cost[where] += retval;
8593     }
8594
8595   return retval;
8596 }
8597
8598 static void initialize_aarch64_code_model (struct gcc_options *);
8599
8600 /* Parse the TO_PARSE string and put the architecture struct that it
8601    selects into RES and the architectural features into ISA_FLAGS.
8602    Return an aarch64_parse_opt_result describing the parse result.
8603    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8604
8605 static enum aarch64_parse_opt_result
8606 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8607                     unsigned long *isa_flags)
8608 {
8609   char *ext;
8610   const struct processor *arch;
8611   char *str = (char *) alloca (strlen (to_parse) + 1);
8612   size_t len;
8613
8614   strcpy (str, to_parse);
8615
8616   ext = strchr (str, '+');
8617
8618   if (ext != NULL)
8619     len = ext - str;
8620   else
8621     len = strlen (str);
8622
8623   if (len == 0)
8624     return AARCH64_PARSE_MISSING_ARG;
8625
8626
8627   /* Loop through the list of supported ARCHes to find a match.  */
8628   for (arch = all_architectures; arch->name != NULL; arch++)
8629     {
8630       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8631         {
8632           unsigned long isa_temp = arch->flags;
8633
8634           if (ext != NULL)
8635             {
8636               /* TO_PARSE string contains at least one extension.  */
8637               enum aarch64_parse_opt_result ext_res
8638                 = aarch64_parse_extension (ext, &isa_temp);
8639
8640               if (ext_res != AARCH64_PARSE_OK)
8641                 return ext_res;
8642             }
8643           /* Extension parsing was successful.  Confirm the result
8644              arch and ISA flags.  */
8645           *res = arch;
8646           *isa_flags = isa_temp;
8647           return AARCH64_PARSE_OK;
8648         }
8649     }
8650
8651   /* ARCH name not found in list.  */
8652   return AARCH64_PARSE_INVALID_ARG;
8653 }
8654
8655 /* Parse the TO_PARSE string and put the result tuning in RES and the
8656    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8657    describing the parse result.  If there is an error parsing, RES and
8658    ISA_FLAGS are left unchanged.  */
8659
8660 static enum aarch64_parse_opt_result
8661 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8662                    unsigned long *isa_flags)
8663 {
8664   char *ext;
8665   const struct processor *cpu;
8666   char *str = (char *) alloca (strlen (to_parse) + 1);
8667   size_t len;
8668
8669   strcpy (str, to_parse);
8670
8671   ext = strchr (str, '+');
8672
8673   if (ext != NULL)
8674     len = ext - str;
8675   else
8676     len = strlen (str);
8677
8678   if (len == 0)
8679     return AARCH64_PARSE_MISSING_ARG;
8680
8681
8682   /* Loop through the list of supported CPUs to find a match.  */
8683   for (cpu = all_cores; cpu->name != NULL; cpu++)
8684     {
8685       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8686         {
8687           unsigned long isa_temp = cpu->flags;
8688
8689
8690           if (ext != NULL)
8691             {
8692               /* TO_PARSE string contains at least one extension.  */
8693               enum aarch64_parse_opt_result ext_res
8694                 = aarch64_parse_extension (ext, &isa_temp);
8695
8696               if (ext_res != AARCH64_PARSE_OK)
8697                 return ext_res;
8698             }
8699           /* Extension parsing was successfull.  Confirm the result
8700              cpu and ISA flags.  */
8701           *res = cpu;
8702           *isa_flags = isa_temp;
8703           return AARCH64_PARSE_OK;
8704         }
8705     }
8706
8707   /* CPU name not found in list.  */
8708   return AARCH64_PARSE_INVALID_ARG;
8709 }
8710
8711 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8712    Return an aarch64_parse_opt_result describing the parse result.
8713    If the parsing fails the RES does not change.  */
8714
8715 static enum aarch64_parse_opt_result
8716 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8717 {
8718   const struct processor *cpu;
8719   char *str = (char *) alloca (strlen (to_parse) + 1);
8720
8721   strcpy (str, to_parse);
8722
8723   /* Loop through the list of supported CPUs to find a match.  */
8724   for (cpu = all_cores; cpu->name != NULL; cpu++)
8725     {
8726       if (strcmp (cpu->name, str) == 0)
8727         {
8728           *res = cpu;
8729           return AARCH64_PARSE_OK;
8730         }
8731     }
8732
8733   /* CPU name not found in list.  */
8734   return AARCH64_PARSE_INVALID_ARG;
8735 }
8736
8737 /* Parse TOKEN, which has length LENGTH to see if it is an option
8738    described in FLAG.  If it is, return the index bit for that fusion type.
8739    If not, error (printing OPTION_NAME) and return zero.  */
8740
8741 static unsigned int
8742 aarch64_parse_one_option_token (const char *token,
8743                                 size_t length,
8744                                 const struct aarch64_flag_desc *flag,
8745                                 const char *option_name)
8746 {
8747   for (; flag->name != NULL; flag++)
8748     {
8749       if (length == strlen (flag->name)
8750           && !strncmp (flag->name, token, length))
8751         return flag->flag;
8752     }
8753
8754   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8755   return 0;
8756 }
8757
8758 /* Parse OPTION which is a comma-separated list of flags to enable.
8759    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8760    default state we inherit from the CPU tuning structures.  OPTION_NAME
8761    gives the top-level option we are parsing in the -moverride string,
8762    for use in error messages.  */
8763
8764 static unsigned int
8765 aarch64_parse_boolean_options (const char *option,
8766                                const struct aarch64_flag_desc *flags,
8767                                unsigned int initial_state,
8768                                const char *option_name)
8769 {
8770   const char separator = '.';
8771   const char* specs = option;
8772   const char* ntoken = option;
8773   unsigned int found_flags = initial_state;
8774
8775   while ((ntoken = strchr (specs, separator)))
8776     {
8777       size_t token_length = ntoken - specs;
8778       unsigned token_ops = aarch64_parse_one_option_token (specs,
8779                                                            token_length,
8780                                                            flags,
8781                                                            option_name);
8782       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8783          in the token stream, reset the supported operations.  So:
8784
8785            adrp+add.cmp+branch.none.adrp+add
8786
8787            would have the result of turning on only adrp+add fusion.  */
8788       if (!token_ops)
8789         found_flags = 0;
8790
8791       found_flags |= token_ops;
8792       specs = ++ntoken;
8793     }
8794
8795   /* We ended with a comma, print something.  */
8796   if (!(*specs))
8797     {
8798       error ("%s string ill-formed\n", option_name);
8799       return 0;
8800     }
8801
8802   /* We still have one more token to parse.  */
8803   size_t token_length = strlen (specs);
8804   unsigned token_ops = aarch64_parse_one_option_token (specs,
8805                                                        token_length,
8806                                                        flags,
8807                                                        option_name);
8808    if (!token_ops)
8809      found_flags = 0;
8810
8811   found_flags |= token_ops;
8812   return found_flags;
8813 }
8814
8815 /* Support for overriding instruction fusion.  */
8816
8817 static void
8818 aarch64_parse_fuse_string (const char *fuse_string,
8819                             struct tune_params *tune)
8820 {
8821   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8822                                                      aarch64_fusible_pairs,
8823                                                      tune->fusible_ops,
8824                                                      "fuse=");
8825 }
8826
8827 /* Support for overriding other tuning flags.  */
8828
8829 static void
8830 aarch64_parse_tune_string (const char *tune_string,
8831                             struct tune_params *tune)
8832 {
8833   tune->extra_tuning_flags
8834     = aarch64_parse_boolean_options (tune_string,
8835                                      aarch64_tuning_flags,
8836                                      tune->extra_tuning_flags,
8837                                      "tune=");
8838 }
8839
8840 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8841    we understand.  If it is, extract the option string and handoff to
8842    the appropriate function.  */
8843
8844 void
8845 aarch64_parse_one_override_token (const char* token,
8846                                   size_t length,
8847                                   struct tune_params *tune)
8848 {
8849   const struct aarch64_tuning_override_function *fn
8850     = aarch64_tuning_override_functions;
8851
8852   const char *option_part = strchr (token, '=');
8853   if (!option_part)
8854     {
8855       error ("tuning string missing in option (%s)", token);
8856       return;
8857     }
8858
8859   /* Get the length of the option name.  */
8860   length = option_part - token;
8861   /* Skip the '=' to get to the option string.  */
8862   option_part++;
8863
8864   for (; fn->name != NULL; fn++)
8865     {
8866       if (!strncmp (fn->name, token, length))
8867         {
8868           fn->parse_override (option_part, tune);
8869           return;
8870         }
8871     }
8872
8873   error ("unknown tuning option (%s)",token);
8874   return;
8875 }
8876
8877 /* A checking mechanism for the implementation of the tls size.  */
8878
8879 static void
8880 initialize_aarch64_tls_size (struct gcc_options *opts)
8881 {
8882   if (aarch64_tls_size == 0)
8883     aarch64_tls_size = 24;
8884
8885   switch (opts->x_aarch64_cmodel_var)
8886     {
8887     case AARCH64_CMODEL_TINY:
8888       /* Both the default and maximum TLS size allowed under tiny is 1M which
8889          needs two instructions to address, so we clamp the size to 24.  */
8890       if (aarch64_tls_size > 24)
8891         aarch64_tls_size = 24;
8892       break;
8893     case AARCH64_CMODEL_SMALL:
8894       /* The maximum TLS size allowed under small is 4G.  */
8895       if (aarch64_tls_size > 32)
8896         aarch64_tls_size = 32;
8897       break;
8898     case AARCH64_CMODEL_LARGE:
8899       /* The maximum TLS size allowed under large is 16E.
8900          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8901       if (aarch64_tls_size > 48)
8902         aarch64_tls_size = 48;
8903       break;
8904     default:
8905       gcc_unreachable ();
8906     }
8907
8908   return;
8909 }
8910
8911 /* Parse STRING looking for options in the format:
8912      string     :: option:string
8913      option     :: name=substring
8914      name       :: {a-z}
8915      substring  :: defined by option.  */
8916
8917 static void
8918 aarch64_parse_override_string (const char* input_string,
8919                                struct tune_params* tune)
8920 {
8921   const char separator = ':';
8922   size_t string_length = strlen (input_string) + 1;
8923   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8924   char *string = string_root;
8925   strncpy (string, input_string, string_length);
8926   string[string_length - 1] = '\0';
8927
8928   char* ntoken = string;
8929
8930   while ((ntoken = strchr (string, separator)))
8931     {
8932       size_t token_length = ntoken - string;
8933       /* Make this substring look like a string.  */
8934       *ntoken = '\0';
8935       aarch64_parse_one_override_token (string, token_length, tune);
8936       string = ++ntoken;
8937     }
8938
8939   /* One last option to parse.  */
8940   aarch64_parse_one_override_token (string, strlen (string), tune);
8941   free (string_root);
8942 }
8943
8944
8945 static void
8946 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8947 {
8948   /* The logic here is that if we are disabling all frame pointer generation
8949      then we do not need to disable leaf frame pointer generation as a
8950      separate operation.  But if we are *only* disabling leaf frame pointer
8951      generation then we set flag_omit_frame_pointer to true, but in
8952      aarch64_frame_pointer_required we return false only for leaf functions.
8953
8954      PR 70044: We have to be careful about being called multiple times for the
8955      same function.  Once we have decided to set flag_omit_frame_pointer just
8956      so that we can omit leaf frame pointers, we must then not interpret a
8957      second call as meaning that all frame pointer generation should be
8958      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8959      non-zero value.  */
8960   if (opts->x_flag_omit_frame_pointer == 2)
8961     opts->x_flag_omit_frame_pointer = 0;
8962
8963   if (opts->x_flag_omit_frame_pointer)
8964     opts->x_flag_omit_leaf_frame_pointer = false;
8965   else if (opts->x_flag_omit_leaf_frame_pointer)
8966     opts->x_flag_omit_frame_pointer = 2;
8967
8968   /* If not optimizing for size, set the default
8969      alignment to what the target wants.  */
8970   if (!opts->x_optimize_size)
8971     {
8972       if (opts->x_align_loops <= 0)
8973         opts->x_align_loops = aarch64_tune_params.loop_align;
8974       if (opts->x_align_jumps <= 0)
8975         opts->x_align_jumps = aarch64_tune_params.jump_align;
8976       if (opts->x_align_functions <= 0)
8977         opts->x_align_functions = aarch64_tune_params.function_align;
8978     }
8979
8980   /* We default to no pc-relative literal loads.  */
8981
8982   aarch64_pcrelative_literal_loads = false;
8983
8984   /* If -mpc-relative-literal-loads is set on the command line, this
8985      implies that the user asked for PC relative literal loads.  */
8986   if (opts->x_pcrelative_literal_loads == 1)
8987     aarch64_pcrelative_literal_loads = true;
8988
8989   /* In the tiny memory model it makes no sense to disallow PC relative
8990      literal pool loads.  */
8991   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8992       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8993     aarch64_pcrelative_literal_loads = true;
8994
8995   /* When enabling the lower precision Newton series for the square root, also
8996      enable it for the reciprocal square root, since the latter is an
8997      intermediary step for the former.  */
8998   if (flag_mlow_precision_sqrt)
8999     flag_mrecip_low_precision_sqrt = true;
9000 }
9001
9002 /* 'Unpack' up the internal tuning structs and update the options
9003     in OPTS.  The caller must have set up selected_tune and selected_arch
9004     as all the other target-specific codegen decisions are
9005     derived from them.  */
9006
9007 void
9008 aarch64_override_options_internal (struct gcc_options *opts)
9009 {
9010   aarch64_tune_flags = selected_tune->flags;
9011   aarch64_tune = selected_tune->sched_core;
9012   /* Make a copy of the tuning parameters attached to the core, which
9013      we may later overwrite.  */
9014   aarch64_tune_params = *(selected_tune->tune);
9015   aarch64_architecture_version = selected_arch->architecture_version;
9016
9017   if (opts->x_aarch64_override_tune_string)
9018     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9019                                   &aarch64_tune_params);
9020
9021   /* This target defaults to strict volatile bitfields.  */
9022   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9023     opts->x_flag_strict_volatile_bitfields = 1;
9024
9025   initialize_aarch64_code_model (opts);
9026   initialize_aarch64_tls_size (opts);
9027
9028   int queue_depth = 0;
9029   switch (aarch64_tune_params.autoprefetcher_model)
9030     {
9031       case tune_params::AUTOPREFETCHER_OFF:
9032         queue_depth = -1;
9033         break;
9034       case tune_params::AUTOPREFETCHER_WEAK:
9035         queue_depth = 0;
9036         break;
9037       case tune_params::AUTOPREFETCHER_STRONG:
9038         queue_depth = max_insn_queue_index + 1;
9039         break;
9040       default:
9041         gcc_unreachable ();
9042     }
9043
9044   /* We don't mind passing in global_options_set here as we don't use
9045      the *options_set structs anyway.  */
9046   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9047                          queue_depth,
9048                          opts->x_param_values,
9049                          global_options_set.x_param_values);
9050
9051   /* Set up parameters to be used in prefetching algorithm.  Do not
9052      override the defaults unless we are tuning for a core we have
9053      researched values for.  */
9054   if (aarch64_tune_params.prefetch->num_slots > 0)
9055     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9056                            aarch64_tune_params.prefetch->num_slots,
9057                            opts->x_param_values,
9058                            global_options_set.x_param_values);
9059   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9060     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9061                            aarch64_tune_params.prefetch->l1_cache_size,
9062                            opts->x_param_values,
9063                            global_options_set.x_param_values);
9064   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9065     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9066                            aarch64_tune_params.prefetch->l1_cache_line_size,
9067                            opts->x_param_values,
9068                            global_options_set.x_param_values);
9069   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9070     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9071                            aarch64_tune_params.prefetch->l2_cache_size,
9072                            opts->x_param_values,
9073                            global_options_set.x_param_values);
9074
9075   /* Enable sw prefetching at specified optimization level for
9076      CPUS that have prefetch.  Lower optimization level threshold by 1
9077      when profiling is enabled.  */
9078   if (opts->x_flag_prefetch_loop_arrays < 0
9079       && !opts->x_optimize_size
9080       && aarch64_tune_params.prefetch->default_opt_level >= 0
9081       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9082     opts->x_flag_prefetch_loop_arrays = 1;
9083
9084   aarch64_override_options_after_change_1 (opts);
9085 }
9086
9087 /* Print a hint with a suggestion for a core or architecture name that
9088    most closely resembles what the user passed in STR.  ARCH is true if
9089    the user is asking for an architecture name.  ARCH is false if the user
9090    is asking for a core name.  */
9091
9092 static void
9093 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9094 {
9095   auto_vec<const char *> candidates;
9096   const struct processor *entry = arch ? all_architectures : all_cores;
9097   for (; entry->name != NULL; entry++)
9098     candidates.safe_push (entry->name);
9099   char *s;
9100   const char *hint = candidates_list_and_hint (str, s, candidates);
9101   if (hint)
9102     inform (input_location, "valid arguments are: %s;"
9103                              " did you mean %qs?", s, hint);
9104   XDELETEVEC (s);
9105 }
9106
9107 /* Print a hint with a suggestion for a core name that most closely resembles
9108    what the user passed in STR.  */
9109
9110 inline static void
9111 aarch64_print_hint_for_core (const char *str)
9112 {
9113   aarch64_print_hint_for_core_or_arch (str, false);
9114 }
9115
9116 /* Print a hint with a suggestion for an architecture name that most closely
9117    resembles what the user passed in STR.  */
9118
9119 inline static void
9120 aarch64_print_hint_for_arch (const char *str)
9121 {
9122   aarch64_print_hint_for_core_or_arch (str, true);
9123 }
9124
9125 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9126    specified in STR and throw errors if appropriate.  Put the results if
9127    they are valid in RES and ISA_FLAGS.  Return whether the option is
9128    valid.  */
9129
9130 static bool
9131 aarch64_validate_mcpu (const char *str, const struct processor **res,
9132                        unsigned long *isa_flags)
9133 {
9134   enum aarch64_parse_opt_result parse_res
9135     = aarch64_parse_cpu (str, res, isa_flags);
9136
9137   if (parse_res == AARCH64_PARSE_OK)
9138     return true;
9139
9140   switch (parse_res)
9141     {
9142       case AARCH64_PARSE_MISSING_ARG:
9143         error ("missing cpu name in %<-mcpu=%s%>", str);
9144         break;
9145       case AARCH64_PARSE_INVALID_ARG:
9146         error ("unknown value %qs for -mcpu", str);
9147         aarch64_print_hint_for_core (str);
9148         break;
9149       case AARCH64_PARSE_INVALID_FEATURE:
9150         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9151         break;
9152       default:
9153         gcc_unreachable ();
9154     }
9155
9156   return false;
9157 }
9158
9159 /* Validate a command-line -march option.  Parse the arch and extensions
9160    (if any) specified in STR and throw errors if appropriate.  Put the
9161    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9162    option is valid.  */
9163
9164 static bool
9165 aarch64_validate_march (const char *str, const struct processor **res,
9166                          unsigned long *isa_flags)
9167 {
9168   enum aarch64_parse_opt_result parse_res
9169     = aarch64_parse_arch (str, res, isa_flags);
9170
9171   if (parse_res == AARCH64_PARSE_OK)
9172     return true;
9173
9174   switch (parse_res)
9175     {
9176       case AARCH64_PARSE_MISSING_ARG:
9177         error ("missing arch name in %<-march=%s%>", str);
9178         break;
9179       case AARCH64_PARSE_INVALID_ARG:
9180         error ("unknown value %qs for -march", str);
9181         aarch64_print_hint_for_arch (str);
9182         break;
9183       case AARCH64_PARSE_INVALID_FEATURE:
9184         error ("invalid feature modifier in %<-march=%s%>", str);
9185         break;
9186       default:
9187         gcc_unreachable ();
9188     }
9189
9190   return false;
9191 }
9192
9193 /* Validate a command-line -mtune option.  Parse the cpu
9194    specified in STR and throw errors if appropriate.  Put the
9195    result, if it is valid, in RES.  Return whether the option is
9196    valid.  */
9197
9198 static bool
9199 aarch64_validate_mtune (const char *str, const struct processor **res)
9200 {
9201   enum aarch64_parse_opt_result parse_res
9202     = aarch64_parse_tune (str, res);
9203
9204   if (parse_res == AARCH64_PARSE_OK)
9205     return true;
9206
9207   switch (parse_res)
9208     {
9209       case AARCH64_PARSE_MISSING_ARG:
9210         error ("missing cpu name in %<-mtune=%s%>", str);
9211         break;
9212       case AARCH64_PARSE_INVALID_ARG:
9213         error ("unknown value %qs for -mtune", str);
9214         aarch64_print_hint_for_core (str);
9215         break;
9216       default:
9217         gcc_unreachable ();
9218     }
9219   return false;
9220 }
9221
9222 /* Return the CPU corresponding to the enum CPU.
9223    If it doesn't specify a cpu, return the default.  */
9224
9225 static const struct processor *
9226 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9227 {
9228   if (cpu != aarch64_none)
9229     return &all_cores[cpu];
9230
9231   /* The & 0x3f is to extract the bottom 6 bits that encode the
9232      default cpu as selected by the --with-cpu GCC configure option
9233      in config.gcc.
9234      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9235      flags mechanism should be reworked to make it more sane.  */
9236   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9237 }
9238
9239 /* Return the architecture corresponding to the enum ARCH.
9240    If it doesn't specify a valid architecture, return the default.  */
9241
9242 static const struct processor *
9243 aarch64_get_arch (enum aarch64_arch arch)
9244 {
9245   if (arch != aarch64_no_arch)
9246     return &all_architectures[arch];
9247
9248   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9249
9250   return &all_architectures[cpu->arch];
9251 }
9252
9253 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9254    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9255    tuning structs.  In particular it must set selected_tune and
9256    aarch64_isa_flags that define the available ISA features and tuning
9257    decisions.  It must also set selected_arch as this will be used to
9258    output the .arch asm tags for each function.  */
9259
9260 static void
9261 aarch64_override_options (void)
9262 {
9263   unsigned long cpu_isa = 0;
9264   unsigned long arch_isa = 0;
9265   aarch64_isa_flags = 0;
9266
9267   bool valid_cpu = true;
9268   bool valid_tune = true;
9269   bool valid_arch = true;
9270
9271   selected_cpu = NULL;
9272   selected_arch = NULL;
9273   selected_tune = NULL;
9274
9275   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9276      If either of -march or -mtune is given, they override their
9277      respective component of -mcpu.  */
9278   if (aarch64_cpu_string)
9279     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9280                                         &cpu_isa);
9281
9282   if (aarch64_arch_string)
9283     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9284                                           &arch_isa);
9285
9286   if (aarch64_tune_string)
9287     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9288
9289   /* If the user did not specify a processor, choose the default
9290      one for them.  This will be the CPU set during configuration using
9291      --with-cpu, otherwise it is "generic".  */
9292   if (!selected_cpu)
9293     {
9294       if (selected_arch)
9295         {
9296           selected_cpu = &all_cores[selected_arch->ident];
9297           aarch64_isa_flags = arch_isa;
9298           explicit_arch = selected_arch->arch;
9299         }
9300       else
9301         {
9302           /* Get default configure-time CPU.  */
9303           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9304           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9305         }
9306
9307       if (selected_tune)
9308         explicit_tune_core = selected_tune->ident;
9309     }
9310   /* If both -mcpu and -march are specified check that they are architecturally
9311      compatible, warn if they're not and prefer the -march ISA flags.  */
9312   else if (selected_arch)
9313     {
9314       if (selected_arch->arch != selected_cpu->arch)
9315         {
9316           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9317                        all_architectures[selected_cpu->arch].name,
9318                        selected_arch->name);
9319         }
9320       aarch64_isa_flags = arch_isa;
9321       explicit_arch = selected_arch->arch;
9322       explicit_tune_core = selected_tune ? selected_tune->ident
9323                                           : selected_cpu->ident;
9324     }
9325   else
9326     {
9327       /* -mcpu but no -march.  */
9328       aarch64_isa_flags = cpu_isa;
9329       explicit_tune_core = selected_tune ? selected_tune->ident
9330                                           : selected_cpu->ident;
9331       gcc_assert (selected_cpu);
9332       selected_arch = &all_architectures[selected_cpu->arch];
9333       explicit_arch = selected_arch->arch;
9334     }
9335
9336   /* Set the arch as well as we will need it when outputing
9337      the .arch directive in assembly.  */
9338   if (!selected_arch)
9339     {
9340       gcc_assert (selected_cpu);
9341       selected_arch = &all_architectures[selected_cpu->arch];
9342     }
9343
9344   if (!selected_tune)
9345     selected_tune = selected_cpu;
9346
9347 #ifndef HAVE_AS_MABI_OPTION
9348   /* The compiler may have been configured with 2.23.* binutils, which does
9349      not have support for ILP32.  */
9350   if (TARGET_ILP32)
9351     error ("Assembler does not support -mabi=ilp32");
9352 #endif
9353
9354   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9355     sorry ("Return address signing is only supported for -mabi=lp64");
9356
9357   /* Make sure we properly set up the explicit options.  */
9358   if ((aarch64_cpu_string && valid_cpu)
9359        || (aarch64_tune_string && valid_tune))
9360     gcc_assert (explicit_tune_core != aarch64_none);
9361
9362   if ((aarch64_cpu_string && valid_cpu)
9363        || (aarch64_arch_string && valid_arch))
9364     gcc_assert (explicit_arch != aarch64_no_arch);
9365
9366   aarch64_override_options_internal (&global_options);
9367
9368   /* Save these options as the default ones in case we push and pop them later
9369      while processing functions with potential target attributes.  */
9370   target_option_default_node = target_option_current_node
9371       = build_target_option_node (&global_options);
9372 }
9373
9374 /* Implement targetm.override_options_after_change.  */
9375
9376 static void
9377 aarch64_override_options_after_change (void)
9378 {
9379   aarch64_override_options_after_change_1 (&global_options);
9380 }
9381
9382 static struct machine_function *
9383 aarch64_init_machine_status (void)
9384 {
9385   struct machine_function *machine;
9386   machine = ggc_cleared_alloc<machine_function> ();
9387   return machine;
9388 }
9389
9390 void
9391 aarch64_init_expanders (void)
9392 {
9393   init_machine_status = aarch64_init_machine_status;
9394 }
9395
9396 /* A checking mechanism for the implementation of the various code models.  */
9397 static void
9398 initialize_aarch64_code_model (struct gcc_options *opts)
9399 {
9400    if (opts->x_flag_pic)
9401      {
9402        switch (opts->x_aarch64_cmodel_var)
9403          {
9404          case AARCH64_CMODEL_TINY:
9405            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9406            break;
9407          case AARCH64_CMODEL_SMALL:
9408 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9409            aarch64_cmodel = (flag_pic == 2
9410                              ? AARCH64_CMODEL_SMALL_PIC
9411                              : AARCH64_CMODEL_SMALL_SPIC);
9412 #else
9413            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9414 #endif
9415            break;
9416          case AARCH64_CMODEL_LARGE:
9417            sorry ("code model %qs with -f%s", "large",
9418                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9419            break;
9420          default:
9421            gcc_unreachable ();
9422          }
9423      }
9424    else
9425      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9426 }
9427
9428 /* Implement TARGET_OPTION_SAVE.  */
9429
9430 static void
9431 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9432 {
9433   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9434 }
9435
9436 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9437    using the information saved in PTR.  */
9438
9439 static void
9440 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9441 {
9442   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9443   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9444   opts->x_explicit_arch = ptr->x_explicit_arch;
9445   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9446   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9447
9448   aarch64_override_options_internal (opts);
9449 }
9450
9451 /* Implement TARGET_OPTION_PRINT.  */
9452
9453 static void
9454 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9455 {
9456   const struct processor *cpu
9457     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9458   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9459   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9460   std::string extension
9461     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9462
9463   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9464   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9465            arch->name, extension.c_str ());
9466 }
9467
9468 static GTY(()) tree aarch64_previous_fndecl;
9469
9470 void
9471 aarch64_reset_previous_fndecl (void)
9472 {
9473   aarch64_previous_fndecl = NULL;
9474 }
9475
9476 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9477    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9478    make sure optab availability predicates are recomputed when necessary.  */
9479
9480 void
9481 aarch64_save_restore_target_globals (tree new_tree)
9482 {
9483   if (TREE_TARGET_GLOBALS (new_tree))
9484     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9485   else if (new_tree == target_option_default_node)
9486     restore_target_globals (&default_target_globals);
9487   else
9488     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9489 }
9490
9491 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9492    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9493    of the function, if such exists.  This function may be called multiple
9494    times on a single function so use aarch64_previous_fndecl to avoid
9495    setting up identical state.  */
9496
9497 static void
9498 aarch64_set_current_function (tree fndecl)
9499 {
9500   if (!fndecl || fndecl == aarch64_previous_fndecl)
9501     return;
9502
9503   tree old_tree = (aarch64_previous_fndecl
9504                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9505                    : NULL_TREE);
9506
9507   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9508
9509   /* If current function has no attributes but the previous one did,
9510      use the default node.  */
9511   if (!new_tree && old_tree)
9512     new_tree = target_option_default_node;
9513
9514   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9515      the default have been handled by aarch64_save_restore_target_globals from
9516      aarch64_pragma_target_parse.  */
9517   if (old_tree == new_tree)
9518     return;
9519
9520   aarch64_previous_fndecl = fndecl;
9521
9522   /* First set the target options.  */
9523   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9524
9525   aarch64_save_restore_target_globals (new_tree);
9526 }
9527
9528 /* Enum describing the various ways we can handle attributes.
9529    In many cases we can reuse the generic option handling machinery.  */
9530
9531 enum aarch64_attr_opt_type
9532 {
9533   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9534   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9535   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9536   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9537 };
9538
9539 /* All the information needed to handle a target attribute.
9540    NAME is the name of the attribute.
9541    ATTR_TYPE specifies the type of behavior of the attribute as described
9542    in the definition of enum aarch64_attr_opt_type.
9543    ALLOW_NEG is true if the attribute supports a "no-" form.
9544    HANDLER is the function that takes the attribute string and whether
9545    it is a pragma or attribute and handles the option.  It is needed only
9546    when the ATTR_TYPE is aarch64_attr_custom.
9547    OPT_NUM is the enum specifying the option that the attribute modifies.
9548    This is needed for attributes that mirror the behavior of a command-line
9549    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9550    aarch64_attr_enum.  */
9551
9552 struct aarch64_attribute_info
9553 {
9554   const char *name;
9555   enum aarch64_attr_opt_type attr_type;
9556   bool allow_neg;
9557   bool (*handler) (const char *, const char *);
9558   enum opt_code opt_num;
9559 };
9560
9561 /* Handle the ARCH_STR argument to the arch= target attribute.
9562    PRAGMA_OR_ATTR is used in potential error messages.  */
9563
9564 static bool
9565 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9566 {
9567   const struct processor *tmp_arch = NULL;
9568   enum aarch64_parse_opt_result parse_res
9569     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9570
9571   if (parse_res == AARCH64_PARSE_OK)
9572     {
9573       gcc_assert (tmp_arch);
9574       selected_arch = tmp_arch;
9575       explicit_arch = selected_arch->arch;
9576       return true;
9577     }
9578
9579   switch (parse_res)
9580     {
9581       case AARCH64_PARSE_MISSING_ARG:
9582         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9583         break;
9584       case AARCH64_PARSE_INVALID_ARG:
9585         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9586         aarch64_print_hint_for_arch (str);
9587         break;
9588       case AARCH64_PARSE_INVALID_FEATURE:
9589         error ("invalid feature modifier %qs for 'arch' target %s",
9590                str, pragma_or_attr);
9591         break;
9592       default:
9593         gcc_unreachable ();
9594     }
9595
9596   return false;
9597 }
9598
9599 /* Handle the argument CPU_STR to the cpu= target attribute.
9600    PRAGMA_OR_ATTR is used in potential error messages.  */
9601
9602 static bool
9603 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9604 {
9605   const struct processor *tmp_cpu = NULL;
9606   enum aarch64_parse_opt_result parse_res
9607     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9608
9609   if (parse_res == AARCH64_PARSE_OK)
9610     {
9611       gcc_assert (tmp_cpu);
9612       selected_tune = tmp_cpu;
9613       explicit_tune_core = selected_tune->ident;
9614
9615       selected_arch = &all_architectures[tmp_cpu->arch];
9616       explicit_arch = selected_arch->arch;
9617       return true;
9618     }
9619
9620   switch (parse_res)
9621     {
9622       case AARCH64_PARSE_MISSING_ARG:
9623         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9624         break;
9625       case AARCH64_PARSE_INVALID_ARG:
9626         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9627         aarch64_print_hint_for_core (str);
9628         break;
9629       case AARCH64_PARSE_INVALID_FEATURE:
9630         error ("invalid feature modifier %qs for 'cpu' target %s",
9631                str, pragma_or_attr);
9632         break;
9633       default:
9634         gcc_unreachable ();
9635     }
9636
9637   return false;
9638 }
9639
9640 /* Handle the argument STR to the tune= target attribute.
9641    PRAGMA_OR_ATTR is used in potential error messages.  */
9642
9643 static bool
9644 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9645 {
9646   const struct processor *tmp_tune = NULL;
9647   enum aarch64_parse_opt_result parse_res
9648     = aarch64_parse_tune (str, &tmp_tune);
9649
9650   if (parse_res == AARCH64_PARSE_OK)
9651     {
9652       gcc_assert (tmp_tune);
9653       selected_tune = tmp_tune;
9654       explicit_tune_core = selected_tune->ident;
9655       return true;
9656     }
9657
9658   switch (parse_res)
9659     {
9660       case AARCH64_PARSE_INVALID_ARG:
9661         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9662         aarch64_print_hint_for_core (str);
9663         break;
9664       default:
9665         gcc_unreachable ();
9666     }
9667
9668   return false;
9669 }
9670
9671 /* Parse an architecture extensions target attribute string specified in STR.
9672    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9673    if successful.  Update aarch64_isa_flags to reflect the ISA features
9674    modified.
9675    PRAGMA_OR_ATTR is used in potential error messages.  */
9676
9677 static bool
9678 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9679 {
9680   enum aarch64_parse_opt_result parse_res;
9681   unsigned long isa_flags = aarch64_isa_flags;
9682
9683   /* We allow "+nothing" in the beginning to clear out all architectural
9684      features if the user wants to handpick specific features.  */
9685   if (strncmp ("+nothing", str, 8) == 0)
9686     {
9687       isa_flags = 0;
9688       str += 8;
9689     }
9690
9691   parse_res = aarch64_parse_extension (str, &isa_flags);
9692
9693   if (parse_res == AARCH64_PARSE_OK)
9694     {
9695       aarch64_isa_flags = isa_flags;
9696       return true;
9697     }
9698
9699   switch (parse_res)
9700     {
9701       case AARCH64_PARSE_MISSING_ARG:
9702         error ("missing feature modifier in target %s %qs",
9703                pragma_or_attr, str);
9704         break;
9705
9706       case AARCH64_PARSE_INVALID_FEATURE:
9707         error ("invalid feature modifier in target %s %qs",
9708                pragma_or_attr, str);
9709         break;
9710
9711       default:
9712         gcc_unreachable ();
9713     }
9714
9715  return false;
9716 }
9717
9718 /* The target attributes that we support.  On top of these we also support just
9719    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9720    handled explicitly in aarch64_process_one_target_attr.  */
9721
9722 static const struct aarch64_attribute_info aarch64_attributes[] =
9723 {
9724   { "general-regs-only", aarch64_attr_mask, false, NULL,
9725      OPT_mgeneral_regs_only },
9726   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9727      OPT_mfix_cortex_a53_835769 },
9728   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9729      OPT_mfix_cortex_a53_843419 },
9730   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9731   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9732   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9733      OPT_momit_leaf_frame_pointer },
9734   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9735   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9736      OPT_march_ },
9737   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9738   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9739      OPT_mtune_ },
9740   { "sign-return-address", aarch64_attr_enum, false, NULL,
9741      OPT_msign_return_address_ },
9742   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9743 };
9744
9745 /* Parse ARG_STR which contains the definition of one target attribute.
9746    Show appropriate errors if any or return true if the attribute is valid.
9747    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9748    we're processing a target attribute or pragma.  */
9749
9750 static bool
9751 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9752 {
9753   bool invert = false;
9754
9755   size_t len = strlen (arg_str);
9756
9757   if (len == 0)
9758     {
9759       error ("malformed target %s", pragma_or_attr);
9760       return false;
9761     }
9762
9763   char *str_to_check = (char *) alloca (len + 1);
9764   strcpy (str_to_check, arg_str);
9765
9766   /* Skip leading whitespace.  */
9767   while (*str_to_check == ' ' || *str_to_check == '\t')
9768     str_to_check++;
9769
9770   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9771      It is easier to detect and handle it explicitly here rather than going
9772      through the machinery for the rest of the target attributes in this
9773      function.  */
9774   if (*str_to_check == '+')
9775     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9776
9777   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9778     {
9779       invert = true;
9780       str_to_check += 3;
9781     }
9782   char *arg = strchr (str_to_check, '=');
9783
9784   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9785      and point ARG to "foo".  */
9786   if (arg)
9787     {
9788       *arg = '\0';
9789       arg++;
9790     }
9791   const struct aarch64_attribute_info *p_attr;
9792   bool found = false;
9793   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9794     {
9795       /* If the names don't match up, or the user has given an argument
9796          to an attribute that doesn't accept one, or didn't give an argument
9797          to an attribute that expects one, fail to match.  */
9798       if (strcmp (str_to_check, p_attr->name) != 0)
9799         continue;
9800
9801       found = true;
9802       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9803                               || p_attr->attr_type == aarch64_attr_enum;
9804
9805       if (attr_need_arg_p ^ (arg != NULL))
9806         {
9807           error ("target %s %qs does not accept an argument",
9808                   pragma_or_attr, str_to_check);
9809           return false;
9810         }
9811
9812       /* If the name matches but the attribute does not allow "no-" versions
9813          then we can't match.  */
9814       if (invert && !p_attr->allow_neg)
9815         {
9816           error ("target %s %qs does not allow a negated form",
9817                   pragma_or_attr, str_to_check);
9818           return false;
9819         }
9820
9821       switch (p_attr->attr_type)
9822         {
9823         /* Has a custom handler registered.
9824            For example, cpu=, arch=, tune=.  */
9825           case aarch64_attr_custom:
9826             gcc_assert (p_attr->handler);
9827             if (!p_attr->handler (arg, pragma_or_attr))
9828               return false;
9829             break;
9830
9831           /* Either set or unset a boolean option.  */
9832           case aarch64_attr_bool:
9833             {
9834               struct cl_decoded_option decoded;
9835
9836               generate_option (p_attr->opt_num, NULL, !invert,
9837                                CL_TARGET, &decoded);
9838               aarch64_handle_option (&global_options, &global_options_set,
9839                                       &decoded, input_location);
9840               break;
9841             }
9842           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9843              should know what mask to apply given the option number.  */
9844           case aarch64_attr_mask:
9845             {
9846               struct cl_decoded_option decoded;
9847               /* We only need to specify the option number.
9848                  aarch64_handle_option will know which mask to apply.  */
9849               decoded.opt_index = p_attr->opt_num;
9850               decoded.value = !invert;
9851               aarch64_handle_option (&global_options, &global_options_set,
9852                                       &decoded, input_location);
9853               break;
9854             }
9855           /* Use the option setting machinery to set an option to an enum.  */
9856           case aarch64_attr_enum:
9857             {
9858               gcc_assert (arg);
9859               bool valid;
9860               int value;
9861               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9862                                               &value, CL_TARGET);
9863               if (valid)
9864                 {
9865                   set_option (&global_options, NULL, p_attr->opt_num, value,
9866                               NULL, DK_UNSPECIFIED, input_location,
9867                               global_dc);
9868                 }
9869               else
9870                 {
9871                   error ("target %s %s=%s is not valid",
9872                          pragma_or_attr, str_to_check, arg);
9873                 }
9874               break;
9875             }
9876           default:
9877             gcc_unreachable ();
9878         }
9879     }
9880
9881   /* If we reached here we either have found an attribute and validated
9882      it or didn't match any.  If we matched an attribute but its arguments
9883      were malformed we will have returned false already.  */
9884   return found;
9885 }
9886
9887 /* Count how many times the character C appears in
9888    NULL-terminated string STR.  */
9889
9890 static unsigned int
9891 num_occurences_in_str (char c, char *str)
9892 {
9893   unsigned int res = 0;
9894   while (*str != '\0')
9895     {
9896       if (*str == c)
9897         res++;
9898
9899       str++;
9900     }
9901
9902   return res;
9903 }
9904
9905 /* Parse the tree in ARGS that contains the target attribute information
9906    and update the global target options space.  PRAGMA_OR_ATTR is a string
9907    to be used in error messages, specifying whether this is processing
9908    a target attribute or a target pragma.  */
9909
9910 bool
9911 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9912 {
9913   if (TREE_CODE (args) == TREE_LIST)
9914     {
9915       do
9916         {
9917           tree head = TREE_VALUE (args);
9918           if (head)
9919             {
9920               if (!aarch64_process_target_attr (head, pragma_or_attr))
9921                 return false;
9922             }
9923           args = TREE_CHAIN (args);
9924         } while (args);
9925
9926       return true;
9927     }
9928
9929   if (TREE_CODE (args) != STRING_CST)
9930     {
9931       error ("attribute %<target%> argument not a string");
9932       return false;
9933     }
9934
9935   size_t len = strlen (TREE_STRING_POINTER (args));
9936   char *str_to_check = (char *) alloca (len + 1);
9937   strcpy (str_to_check, TREE_STRING_POINTER (args));
9938
9939   if (len == 0)
9940     {
9941       error ("malformed target %s value", pragma_or_attr);
9942       return false;
9943     }
9944
9945   /* Used to catch empty spaces between commas i.e.
9946      attribute ((target ("attr1,,attr2"))).  */
9947   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9948
9949   /* Handle multiple target attributes separated by ','.  */
9950   char *token = strtok (str_to_check, ",");
9951
9952   unsigned int num_attrs = 0;
9953   while (token)
9954     {
9955       num_attrs++;
9956       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9957         {
9958           error ("target %s %qs is invalid", pragma_or_attr, token);
9959           return false;
9960         }
9961
9962       token = strtok (NULL, ",");
9963     }
9964
9965   if (num_attrs != num_commas + 1)
9966     {
9967       error ("malformed target %s list %qs",
9968               pragma_or_attr, TREE_STRING_POINTER (args));
9969       return false;
9970     }
9971
9972   return true;
9973 }
9974
9975 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9976    process attribute ((target ("..."))).  */
9977
9978 static bool
9979 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9980 {
9981   struct cl_target_option cur_target;
9982   bool ret;
9983   tree old_optimize;
9984   tree new_target, new_optimize;
9985   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9986
9987   /* If what we're processing is the current pragma string then the
9988      target option node is already stored in target_option_current_node
9989      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9990      having to re-parse the string.  This is especially useful to keep
9991      arm_neon.h compile times down since that header contains a lot
9992      of intrinsics enclosed in pragmas.  */
9993   if (!existing_target && args == current_target_pragma)
9994     {
9995       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9996       return true;
9997     }
9998   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9999
10000   old_optimize = build_optimization_node (&global_options);
10001   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10002
10003   /* If the function changed the optimization levels as well as setting
10004      target options, start with the optimizations specified.  */
10005   if (func_optimize && func_optimize != old_optimize)
10006     cl_optimization_restore (&global_options,
10007                              TREE_OPTIMIZATION (func_optimize));
10008
10009   /* Save the current target options to restore at the end.  */
10010   cl_target_option_save (&cur_target, &global_options);
10011
10012   /* If fndecl already has some target attributes applied to it, unpack
10013      them so that we add this attribute on top of them, rather than
10014      overwriting them.  */
10015   if (existing_target)
10016     {
10017       struct cl_target_option *existing_options
10018         = TREE_TARGET_OPTION (existing_target);
10019
10020       if (existing_options)
10021         cl_target_option_restore (&global_options, existing_options);
10022     }
10023   else
10024     cl_target_option_restore (&global_options,
10025                         TREE_TARGET_OPTION (target_option_current_node));
10026
10027
10028   ret = aarch64_process_target_attr (args, "attribute");
10029
10030   /* Set up any additional state.  */
10031   if (ret)
10032     {
10033       aarch64_override_options_internal (&global_options);
10034       /* Initialize SIMD builtins if we haven't already.
10035          Set current_target_pragma to NULL for the duration so that
10036          the builtin initialization code doesn't try to tag the functions
10037          being built with the attributes specified by any current pragma, thus
10038          going into an infinite recursion.  */
10039       if (TARGET_SIMD)
10040         {
10041           tree saved_current_target_pragma = current_target_pragma;
10042           current_target_pragma = NULL;
10043           aarch64_init_simd_builtins ();
10044           current_target_pragma = saved_current_target_pragma;
10045         }
10046       new_target = build_target_option_node (&global_options);
10047     }
10048   else
10049     new_target = NULL;
10050
10051   new_optimize = build_optimization_node (&global_options);
10052
10053   if (fndecl && ret)
10054     {
10055       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10056
10057       if (old_optimize != new_optimize)
10058         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10059     }
10060
10061   cl_target_option_restore (&global_options, &cur_target);
10062
10063   if (old_optimize != new_optimize)
10064     cl_optimization_restore (&global_options,
10065                              TREE_OPTIMIZATION (old_optimize));
10066   return ret;
10067 }
10068
10069 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10070    tri-bool options (yes, no, don't care) and the default value is
10071    DEF, determine whether to reject inlining.  */
10072
10073 static bool
10074 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10075                                      int dont_care, int def)
10076 {
10077   /* If the callee doesn't care, always allow inlining.  */
10078   if (callee == dont_care)
10079     return true;
10080
10081   /* If the caller doesn't care, always allow inlining.  */
10082   if (caller == dont_care)
10083     return true;
10084
10085   /* Otherwise, allow inlining if either the callee and caller values
10086      agree, or if the callee is using the default value.  */
10087   return (callee == caller || callee == def);
10088 }
10089
10090 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10091    to inline CALLEE into CALLER based on target-specific info.
10092    Make sure that the caller and callee have compatible architectural
10093    features.  Then go through the other possible target attributes
10094    and see if they can block inlining.  Try not to reject always_inline
10095    callees unless they are incompatible architecturally.  */
10096
10097 static bool
10098 aarch64_can_inline_p (tree caller, tree callee)
10099 {
10100   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10101   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10102
10103   /* If callee has no option attributes, then it is ok to inline.  */
10104   if (!callee_tree)
10105     return true;
10106
10107   struct cl_target_option *caller_opts
10108         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10109                                            : target_option_default_node);
10110
10111   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10112
10113
10114   /* Callee's ISA flags should be a subset of the caller's.  */
10115   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10116        != callee_opts->x_aarch64_isa_flags)
10117     return false;
10118
10119   /* Allow non-strict aligned functions inlining into strict
10120      aligned ones.  */
10121   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10122        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10123       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10124            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10125     return false;
10126
10127   bool always_inline = lookup_attribute ("always_inline",
10128                                           DECL_ATTRIBUTES (callee));
10129
10130   /* If the architectural features match up and the callee is always_inline
10131      then the other attributes don't matter.  */
10132   if (always_inline)
10133     return true;
10134
10135   if (caller_opts->x_aarch64_cmodel_var
10136       != callee_opts->x_aarch64_cmodel_var)
10137     return false;
10138
10139   if (caller_opts->x_aarch64_tls_dialect
10140       != callee_opts->x_aarch64_tls_dialect)
10141     return false;
10142
10143   /* Honour explicit requests to workaround errata.  */
10144   if (!aarch64_tribools_ok_for_inlining_p (
10145           caller_opts->x_aarch64_fix_a53_err835769,
10146           callee_opts->x_aarch64_fix_a53_err835769,
10147           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10148     return false;
10149
10150   if (!aarch64_tribools_ok_for_inlining_p (
10151           caller_opts->x_aarch64_fix_a53_err843419,
10152           callee_opts->x_aarch64_fix_a53_err843419,
10153           2, TARGET_FIX_ERR_A53_843419))
10154     return false;
10155
10156   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10157      caller and calle and they don't match up, reject inlining.  */
10158   if (!aarch64_tribools_ok_for_inlining_p (
10159           caller_opts->x_flag_omit_leaf_frame_pointer,
10160           callee_opts->x_flag_omit_leaf_frame_pointer,
10161           2, 1))
10162     return false;
10163
10164   /* If the callee has specific tuning overrides, respect them.  */
10165   if (callee_opts->x_aarch64_override_tune_string != NULL
10166       && caller_opts->x_aarch64_override_tune_string == NULL)
10167     return false;
10168
10169   /* If the user specified tuning override strings for the
10170      caller and callee and they don't match up, reject inlining.
10171      We just do a string compare here, we don't analyze the meaning
10172      of the string, as it would be too costly for little gain.  */
10173   if (callee_opts->x_aarch64_override_tune_string
10174       && caller_opts->x_aarch64_override_tune_string
10175       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10176                   caller_opts->x_aarch64_override_tune_string) != 0))
10177     return false;
10178
10179   return true;
10180 }
10181
10182 /* Return true if SYMBOL_REF X binds locally.  */
10183
10184 static bool
10185 aarch64_symbol_binds_local_p (const_rtx x)
10186 {
10187   return (SYMBOL_REF_DECL (x)
10188           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10189           : SYMBOL_REF_LOCAL_P (x));
10190 }
10191
10192 /* Return true if SYMBOL_REF X is thread local */
10193 static bool
10194 aarch64_tls_symbol_p (rtx x)
10195 {
10196   if (! TARGET_HAVE_TLS)
10197     return false;
10198
10199   if (GET_CODE (x) != SYMBOL_REF)
10200     return false;
10201
10202   return SYMBOL_REF_TLS_MODEL (x) != 0;
10203 }
10204
10205 /* Classify a TLS symbol into one of the TLS kinds.  */
10206 enum aarch64_symbol_type
10207 aarch64_classify_tls_symbol (rtx x)
10208 {
10209   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10210
10211   switch (tls_kind)
10212     {
10213     case TLS_MODEL_GLOBAL_DYNAMIC:
10214     case TLS_MODEL_LOCAL_DYNAMIC:
10215       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10216
10217     case TLS_MODEL_INITIAL_EXEC:
10218       switch (aarch64_cmodel)
10219         {
10220         case AARCH64_CMODEL_TINY:
10221         case AARCH64_CMODEL_TINY_PIC:
10222           return SYMBOL_TINY_TLSIE;
10223         default:
10224           return SYMBOL_SMALL_TLSIE;
10225         }
10226
10227     case TLS_MODEL_LOCAL_EXEC:
10228       if (aarch64_tls_size == 12)
10229         return SYMBOL_TLSLE12;
10230       else if (aarch64_tls_size == 24)
10231         return SYMBOL_TLSLE24;
10232       else if (aarch64_tls_size == 32)
10233         return SYMBOL_TLSLE32;
10234       else if (aarch64_tls_size == 48)
10235         return SYMBOL_TLSLE48;
10236       else
10237         gcc_unreachable ();
10238
10239     case TLS_MODEL_EMULATED:
10240     case TLS_MODEL_NONE:
10241       return SYMBOL_FORCE_TO_MEM;
10242
10243     default:
10244       gcc_unreachable ();
10245     }
10246 }
10247
10248 /* Return the method that should be used to access SYMBOL_REF or
10249    LABEL_REF X.  */
10250
10251 enum aarch64_symbol_type
10252 aarch64_classify_symbol (rtx x, rtx offset)
10253 {
10254   if (GET_CODE (x) == LABEL_REF)
10255     {
10256       switch (aarch64_cmodel)
10257         {
10258         case AARCH64_CMODEL_LARGE:
10259           return SYMBOL_FORCE_TO_MEM;
10260
10261         case AARCH64_CMODEL_TINY_PIC:
10262         case AARCH64_CMODEL_TINY:
10263           return SYMBOL_TINY_ABSOLUTE;
10264
10265         case AARCH64_CMODEL_SMALL_SPIC:
10266         case AARCH64_CMODEL_SMALL_PIC:
10267         case AARCH64_CMODEL_SMALL:
10268           return SYMBOL_SMALL_ABSOLUTE;
10269
10270         default:
10271           gcc_unreachable ();
10272         }
10273     }
10274
10275   if (GET_CODE (x) == SYMBOL_REF)
10276     {
10277       if (aarch64_tls_symbol_p (x))
10278         return aarch64_classify_tls_symbol (x);
10279
10280       switch (aarch64_cmodel)
10281         {
10282         case AARCH64_CMODEL_TINY:
10283           /* When we retrieve symbol + offset address, we have to make sure
10284              the offset does not cause overflow of the final address.  But
10285              we have no way of knowing the address of symbol at compile time
10286              so we can't accurately say if the distance between the PC and
10287              symbol + offset is outside the addressible range of +/-1M in the
10288              TINY code model.  So we rely on images not being greater than
10289              1M and cap the offset at 1M and anything beyond 1M will have to
10290              be loaded using an alternative mechanism.  Furthermore if the
10291              symbol is a weak reference to something that isn't known to
10292              resolve to a symbol in this module, then force to memory.  */
10293           if ((SYMBOL_REF_WEAK (x)
10294                && !aarch64_symbol_binds_local_p (x))
10295               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10296             return SYMBOL_FORCE_TO_MEM;
10297           return SYMBOL_TINY_ABSOLUTE;
10298
10299         case AARCH64_CMODEL_SMALL:
10300           /* Same reasoning as the tiny code model, but the offset cap here is
10301              4G.  */
10302           if ((SYMBOL_REF_WEAK (x)
10303                && !aarch64_symbol_binds_local_p (x))
10304               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10305                             HOST_WIDE_INT_C (4294967264)))
10306             return SYMBOL_FORCE_TO_MEM;
10307           return SYMBOL_SMALL_ABSOLUTE;
10308
10309         case AARCH64_CMODEL_TINY_PIC:
10310           if (!aarch64_symbol_binds_local_p (x))
10311             return SYMBOL_TINY_GOT;
10312           return SYMBOL_TINY_ABSOLUTE;
10313
10314         case AARCH64_CMODEL_SMALL_SPIC:
10315         case AARCH64_CMODEL_SMALL_PIC:
10316           if (!aarch64_symbol_binds_local_p (x))
10317             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10318                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10319           return SYMBOL_SMALL_ABSOLUTE;
10320
10321         case AARCH64_CMODEL_LARGE:
10322           /* This is alright even in PIC code as the constant
10323              pool reference is always PC relative and within
10324              the same translation unit.  */
10325           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10326             return SYMBOL_SMALL_ABSOLUTE;
10327           else
10328             return SYMBOL_FORCE_TO_MEM;
10329
10330         default:
10331           gcc_unreachable ();
10332         }
10333     }
10334
10335   /* By default push everything into the constant pool.  */
10336   return SYMBOL_FORCE_TO_MEM;
10337 }
10338
10339 bool
10340 aarch64_constant_address_p (rtx x)
10341 {
10342   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10343 }
10344
10345 bool
10346 aarch64_legitimate_pic_operand_p (rtx x)
10347 {
10348   if (GET_CODE (x) == SYMBOL_REF
10349       || (GET_CODE (x) == CONST
10350           && GET_CODE (XEXP (x, 0)) == PLUS
10351           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10352      return false;
10353
10354   return true;
10355 }
10356
10357 /* Return true if X holds either a quarter-precision or
10358      floating-point +0.0 constant.  */
10359 static bool
10360 aarch64_valid_floating_const (rtx x)
10361 {
10362   if (!CONST_DOUBLE_P (x))
10363     return false;
10364
10365   /* This call determines which constants can be used in mov<mode>
10366      as integer moves instead of constant loads.  */
10367   if (aarch64_float_const_rtx_p (x))
10368     return true;
10369
10370   return aarch64_float_const_representable_p (x);
10371 }
10372
10373 static bool
10374 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10375 {
10376   /* Do not allow vector struct mode constants.  We could support
10377      0 and -1 easily, but they need support in aarch64-simd.md.  */
10378   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10379     return false;
10380
10381   /* For these cases we never want to use a literal load.
10382      As such we have to prevent the compiler from forcing these
10383      to memory.  */
10384   if ((GET_CODE (x) == CONST_VECTOR
10385        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10386       || CONST_INT_P (x)
10387       || aarch64_valid_floating_const (x)
10388       || aarch64_can_const_movi_rtx_p (x, mode)
10389       || aarch64_float_const_rtx_p (x))
10390         return !targetm.cannot_force_const_mem (mode, x);
10391
10392   if (GET_CODE (x) == HIGH
10393       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10394     return true;
10395
10396   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10397      so spilling them is better than rematerialization.  */
10398   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10399     return true;
10400
10401   return aarch64_constant_address_p (x);
10402 }
10403
10404 rtx
10405 aarch64_load_tp (rtx target)
10406 {
10407   if (!target
10408       || GET_MODE (target) != Pmode
10409       || !register_operand (target, Pmode))
10410     target = gen_reg_rtx (Pmode);
10411
10412   /* Can return in any reg.  */
10413   emit_insn (gen_aarch64_load_tp_hard (target));
10414   return target;
10415 }
10416
10417 /* On AAPCS systems, this is the "struct __va_list".  */
10418 static GTY(()) tree va_list_type;
10419
10420 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10421    Return the type to use as __builtin_va_list.
10422
10423    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10424
10425    struct __va_list
10426    {
10427      void *__stack;
10428      void *__gr_top;
10429      void *__vr_top;
10430      int   __gr_offs;
10431      int   __vr_offs;
10432    };  */
10433
10434 static tree
10435 aarch64_build_builtin_va_list (void)
10436 {
10437   tree va_list_name;
10438   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10439
10440   /* Create the type.  */
10441   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10442   /* Give it the required name.  */
10443   va_list_name = build_decl (BUILTINS_LOCATION,
10444                              TYPE_DECL,
10445                              get_identifier ("__va_list"),
10446                              va_list_type);
10447   DECL_ARTIFICIAL (va_list_name) = 1;
10448   TYPE_NAME (va_list_type) = va_list_name;
10449   TYPE_STUB_DECL (va_list_type) = va_list_name;
10450
10451   /* Create the fields.  */
10452   f_stack = build_decl (BUILTINS_LOCATION,
10453                         FIELD_DECL, get_identifier ("__stack"),
10454                         ptr_type_node);
10455   f_grtop = build_decl (BUILTINS_LOCATION,
10456                         FIELD_DECL, get_identifier ("__gr_top"),
10457                         ptr_type_node);
10458   f_vrtop = build_decl (BUILTINS_LOCATION,
10459                         FIELD_DECL, get_identifier ("__vr_top"),
10460                         ptr_type_node);
10461   f_groff = build_decl (BUILTINS_LOCATION,
10462                         FIELD_DECL, get_identifier ("__gr_offs"),
10463                         integer_type_node);
10464   f_vroff = build_decl (BUILTINS_LOCATION,
10465                         FIELD_DECL, get_identifier ("__vr_offs"),
10466                         integer_type_node);
10467
10468   /* Tell tree-stdarg pass about our internal offset fields.
10469      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10470      purpose to identify whether the code is updating va_list internal
10471      offset fields through irregular way.  */
10472   va_list_gpr_counter_field = f_groff;
10473   va_list_fpr_counter_field = f_vroff;
10474
10475   DECL_ARTIFICIAL (f_stack) = 1;
10476   DECL_ARTIFICIAL (f_grtop) = 1;
10477   DECL_ARTIFICIAL (f_vrtop) = 1;
10478   DECL_ARTIFICIAL (f_groff) = 1;
10479   DECL_ARTIFICIAL (f_vroff) = 1;
10480
10481   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10482   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10483   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10484   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10485   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10486
10487   TYPE_FIELDS (va_list_type) = f_stack;
10488   DECL_CHAIN (f_stack) = f_grtop;
10489   DECL_CHAIN (f_grtop) = f_vrtop;
10490   DECL_CHAIN (f_vrtop) = f_groff;
10491   DECL_CHAIN (f_groff) = f_vroff;
10492
10493   /* Compute its layout.  */
10494   layout_type (va_list_type);
10495
10496   return va_list_type;
10497 }
10498
10499 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10500 static void
10501 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10502 {
10503   const CUMULATIVE_ARGS *cum;
10504   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10505   tree stack, grtop, vrtop, groff, vroff;
10506   tree t;
10507   int gr_save_area_size = cfun->va_list_gpr_size;
10508   int vr_save_area_size = cfun->va_list_fpr_size;
10509   int vr_offset;
10510
10511   cum = &crtl->args.info;
10512   if (cfun->va_list_gpr_size)
10513     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10514                              cfun->va_list_gpr_size);
10515   if (cfun->va_list_fpr_size)
10516     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10517                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10518
10519   if (!TARGET_FLOAT)
10520     {
10521       gcc_assert (cum->aapcs_nvrn == 0);
10522       vr_save_area_size = 0;
10523     }
10524
10525   f_stack = TYPE_FIELDS (va_list_type_node);
10526   f_grtop = DECL_CHAIN (f_stack);
10527   f_vrtop = DECL_CHAIN (f_grtop);
10528   f_groff = DECL_CHAIN (f_vrtop);
10529   f_vroff = DECL_CHAIN (f_groff);
10530
10531   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10532                   NULL_TREE);
10533   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10534                   NULL_TREE);
10535   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10536                   NULL_TREE);
10537   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10538                   NULL_TREE);
10539   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10540                   NULL_TREE);
10541
10542   /* Emit code to initialize STACK, which points to the next varargs stack
10543      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10544      by named arguments.  STACK is 8-byte aligned.  */
10545   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10546   if (cum->aapcs_stack_size > 0)
10547     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10548   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10549   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10550
10551   /* Emit code to initialize GRTOP, the top of the GR save area.
10552      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10553   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10554   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10555   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10556
10557   /* Emit code to initialize VRTOP, the top of the VR save area.
10558      This address is gr_save_area_bytes below GRTOP, rounded
10559      down to the next 16-byte boundary.  */
10560   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10561   vr_offset = ROUND_UP (gr_save_area_size,
10562                         STACK_BOUNDARY / BITS_PER_UNIT);
10563
10564   if (vr_offset)
10565     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10566   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10567   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10568
10569   /* Emit code to initialize GROFF, the offset from GRTOP of the
10570      next GPR argument.  */
10571   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10572               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10573   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10574
10575   /* Likewise emit code to initialize VROFF, the offset from FTOP
10576      of the next VR argument.  */
10577   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10578               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10579   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10580 }
10581
10582 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10583
10584 static tree
10585 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10586                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10587 {
10588   tree addr;
10589   bool indirect_p;
10590   bool is_ha;           /* is HFA or HVA.  */
10591   bool dw_align;        /* double-word align.  */
10592   machine_mode ag_mode = VOIDmode;
10593   int nregs;
10594   machine_mode mode;
10595
10596   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10597   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10598   HOST_WIDE_INT size, rsize, adjust, align;
10599   tree t, u, cond1, cond2;
10600
10601   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10602   if (indirect_p)
10603     type = build_pointer_type (type);
10604
10605   mode = TYPE_MODE (type);
10606
10607   f_stack = TYPE_FIELDS (va_list_type_node);
10608   f_grtop = DECL_CHAIN (f_stack);
10609   f_vrtop = DECL_CHAIN (f_grtop);
10610   f_groff = DECL_CHAIN (f_vrtop);
10611   f_vroff = DECL_CHAIN (f_groff);
10612
10613   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10614                   f_stack, NULL_TREE);
10615   size = int_size_in_bytes (type);
10616   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10617
10618   dw_align = false;
10619   adjust = 0;
10620   if (aarch64_vfp_is_call_or_return_candidate (mode,
10621                                                type,
10622                                                &ag_mode,
10623                                                &nregs,
10624                                                &is_ha))
10625     {
10626       /* TYPE passed in fp/simd registers.  */
10627       if (!TARGET_FLOAT)
10628         aarch64_err_no_fpadvsimd (mode, "varargs");
10629
10630       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10631                       unshare_expr (valist), f_vrtop, NULL_TREE);
10632       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10633                       unshare_expr (valist), f_vroff, NULL_TREE);
10634
10635       rsize = nregs * UNITS_PER_VREG;
10636
10637       if (is_ha)
10638         {
10639           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10640             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10641         }
10642       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10643                && size < UNITS_PER_VREG)
10644         {
10645           adjust = UNITS_PER_VREG - size;
10646         }
10647     }
10648   else
10649     {
10650       /* TYPE passed in general registers.  */
10651       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10652                       unshare_expr (valist), f_grtop, NULL_TREE);
10653       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10654                       unshare_expr (valist), f_groff, NULL_TREE);
10655       rsize = ROUND_UP (size, UNITS_PER_WORD);
10656       nregs = rsize / UNITS_PER_WORD;
10657
10658       if (align > 8)
10659         dw_align = true;
10660
10661       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10662           && size < UNITS_PER_WORD)
10663         {
10664           adjust = UNITS_PER_WORD  - size;
10665         }
10666     }
10667
10668   /* Get a local temporary for the field value.  */
10669   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10670
10671   /* Emit code to branch if off >= 0.  */
10672   t = build2 (GE_EXPR, boolean_type_node, off,
10673               build_int_cst (TREE_TYPE (off), 0));
10674   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10675
10676   if (dw_align)
10677     {
10678       /* Emit: offs = (offs + 15) & -16.  */
10679       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10680                   build_int_cst (TREE_TYPE (off), 15));
10681       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10682                   build_int_cst (TREE_TYPE (off), -16));
10683       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10684     }
10685   else
10686     roundup = NULL;
10687
10688   /* Update ap.__[g|v]r_offs  */
10689   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10690               build_int_cst (TREE_TYPE (off), rsize));
10691   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10692
10693   /* String up.  */
10694   if (roundup)
10695     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10696
10697   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10698   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10699               build_int_cst (TREE_TYPE (f_off), 0));
10700   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10701
10702   /* String up: make sure the assignment happens before the use.  */
10703   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10704   COND_EXPR_ELSE (cond1) = t;
10705
10706   /* Prepare the trees handling the argument that is passed on the stack;
10707      the top level node will store in ON_STACK.  */
10708   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10709   if (align > 8)
10710     {
10711       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10712       t = fold_convert (intDI_type_node, arg);
10713       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10714                   build_int_cst (TREE_TYPE (t), 15));
10715       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10716                   build_int_cst (TREE_TYPE (t), -16));
10717       t = fold_convert (TREE_TYPE (arg), t);
10718       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10719     }
10720   else
10721     roundup = NULL;
10722   /* Advance ap.__stack  */
10723   t = fold_convert (intDI_type_node, arg);
10724   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10725               build_int_cst (TREE_TYPE (t), size + 7));
10726   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10727               build_int_cst (TREE_TYPE (t), -8));
10728   t = fold_convert (TREE_TYPE (arg), t);
10729   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10730   /* String up roundup and advance.  */
10731   if (roundup)
10732     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10733   /* String up with arg */
10734   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10735   /* Big-endianness related address adjustment.  */
10736   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10737       && size < UNITS_PER_WORD)
10738   {
10739     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10740                 size_int (UNITS_PER_WORD - size));
10741     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10742   }
10743
10744   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10745   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10746
10747   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10748   t = off;
10749   if (adjust)
10750     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10751                 build_int_cst (TREE_TYPE (off), adjust));
10752
10753   t = fold_convert (sizetype, t);
10754   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10755
10756   if (is_ha)
10757     {
10758       /* type ha; // treat as "struct {ftype field[n];}"
10759          ... [computing offs]
10760          for (i = 0; i <nregs; ++i, offs += 16)
10761            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10762          return ha;  */
10763       int i;
10764       tree tmp_ha, field_t, field_ptr_t;
10765
10766       /* Declare a local variable.  */
10767       tmp_ha = create_tmp_var_raw (type, "ha");
10768       gimple_add_tmp_var (tmp_ha);
10769
10770       /* Establish the base type.  */
10771       switch (ag_mode)
10772         {
10773         case E_SFmode:
10774           field_t = float_type_node;
10775           field_ptr_t = float_ptr_type_node;
10776           break;
10777         case E_DFmode:
10778           field_t = double_type_node;
10779           field_ptr_t = double_ptr_type_node;
10780           break;
10781         case E_TFmode:
10782           field_t = long_double_type_node;
10783           field_ptr_t = long_double_ptr_type_node;
10784           break;
10785         case E_HFmode:
10786           field_t = aarch64_fp16_type_node;
10787           field_ptr_t = aarch64_fp16_ptr_type_node;
10788           break;
10789         case E_V2SImode:
10790         case E_V4SImode:
10791             {
10792               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10793               field_t = build_vector_type_for_mode (innertype, ag_mode);
10794               field_ptr_t = build_pointer_type (field_t);
10795             }
10796           break;
10797         default:
10798           gcc_assert (0);
10799         }
10800
10801       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10802       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10803       addr = t;
10804       t = fold_convert (field_ptr_t, addr);
10805       t = build2 (MODIFY_EXPR, field_t,
10806                   build1 (INDIRECT_REF, field_t, tmp_ha),
10807                   build1 (INDIRECT_REF, field_t, t));
10808
10809       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10810       for (i = 1; i < nregs; ++i)
10811         {
10812           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10813           u = fold_convert (field_ptr_t, addr);
10814           u = build2 (MODIFY_EXPR, field_t,
10815                       build2 (MEM_REF, field_t, tmp_ha,
10816                               build_int_cst (field_ptr_t,
10817                                              (i *
10818                                               int_size_in_bytes (field_t)))),
10819                       build1 (INDIRECT_REF, field_t, u));
10820           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10821         }
10822
10823       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10824       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10825     }
10826
10827   COND_EXPR_ELSE (cond2) = t;
10828   addr = fold_convert (build_pointer_type (type), cond1);
10829   addr = build_va_arg_indirect_ref (addr);
10830
10831   if (indirect_p)
10832     addr = build_va_arg_indirect_ref (addr);
10833
10834   return addr;
10835 }
10836
10837 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10838
10839 static void
10840 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10841                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10842                                 int no_rtl)
10843 {
10844   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10845   CUMULATIVE_ARGS local_cum;
10846   int gr_saved = cfun->va_list_gpr_size;
10847   int vr_saved = cfun->va_list_fpr_size;
10848
10849   /* The caller has advanced CUM up to, but not beyond, the last named
10850      argument.  Advance a local copy of CUM past the last "real" named
10851      argument, to find out how many registers are left over.  */
10852   local_cum = *cum;
10853   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10854
10855   /* Found out how many registers we need to save.
10856      Honor tree-stdvar analysis results.  */
10857   if (cfun->va_list_gpr_size)
10858     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10859                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10860   if (cfun->va_list_fpr_size)
10861     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10862                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10863
10864   if (!TARGET_FLOAT)
10865     {
10866       gcc_assert (local_cum.aapcs_nvrn == 0);
10867       vr_saved = 0;
10868     }
10869
10870   if (!no_rtl)
10871     {
10872       if (gr_saved > 0)
10873         {
10874           rtx ptr, mem;
10875
10876           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10877           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10878                                - gr_saved * UNITS_PER_WORD);
10879           mem = gen_frame_mem (BLKmode, ptr);
10880           set_mem_alias_set (mem, get_varargs_alias_set ());
10881
10882           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10883                                mem, gr_saved);
10884         }
10885       if (vr_saved > 0)
10886         {
10887           /* We can't use move_block_from_reg, because it will use
10888              the wrong mode, storing D regs only.  */
10889           machine_mode mode = TImode;
10890           int off, i, vr_start;
10891
10892           /* Set OFF to the offset from virtual_incoming_args_rtx of
10893              the first vector register.  The VR save area lies below
10894              the GR one, and is aligned to 16 bytes.  */
10895           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10896                            STACK_BOUNDARY / BITS_PER_UNIT);
10897           off -= vr_saved * UNITS_PER_VREG;
10898
10899           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10900           for (i = 0; i < vr_saved; ++i)
10901             {
10902               rtx ptr, mem;
10903
10904               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10905               mem = gen_frame_mem (mode, ptr);
10906               set_mem_alias_set (mem, get_varargs_alias_set ());
10907               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10908               off += UNITS_PER_VREG;
10909             }
10910         }
10911     }
10912
10913   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10914      any complication of having crtl->args.pretend_args_size changed.  */
10915   cfun->machine->frame.saved_varargs_size
10916     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10917                  STACK_BOUNDARY / BITS_PER_UNIT)
10918        + vr_saved * UNITS_PER_VREG);
10919 }
10920
10921 static void
10922 aarch64_conditional_register_usage (void)
10923 {
10924   int i;
10925   if (!TARGET_FLOAT)
10926     {
10927       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10928         {
10929           fixed_regs[i] = 1;
10930           call_used_regs[i] = 1;
10931         }
10932     }
10933 }
10934
10935 /* Walk down the type tree of TYPE counting consecutive base elements.
10936    If *MODEP is VOIDmode, then set it to the first valid floating point
10937    type.  If a non-floating point type is found, or if a floating point
10938    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10939    otherwise return the count in the sub-tree.  */
10940 static int
10941 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10942 {
10943   machine_mode mode;
10944   HOST_WIDE_INT size;
10945
10946   switch (TREE_CODE (type))
10947     {
10948     case REAL_TYPE:
10949       mode = TYPE_MODE (type);
10950       if (mode != DFmode && mode != SFmode
10951           && mode != TFmode && mode != HFmode)
10952         return -1;
10953
10954       if (*modep == VOIDmode)
10955         *modep = mode;
10956
10957       if (*modep == mode)
10958         return 1;
10959
10960       break;
10961
10962     case COMPLEX_TYPE:
10963       mode = TYPE_MODE (TREE_TYPE (type));
10964       if (mode != DFmode && mode != SFmode
10965           && mode != TFmode && mode != HFmode)
10966         return -1;
10967
10968       if (*modep == VOIDmode)
10969         *modep = mode;
10970
10971       if (*modep == mode)
10972         return 2;
10973
10974       break;
10975
10976     case VECTOR_TYPE:
10977       /* Use V2SImode and V4SImode as representatives of all 64-bit
10978          and 128-bit vector types.  */
10979       size = int_size_in_bytes (type);
10980       switch (size)
10981         {
10982         case 8:
10983           mode = V2SImode;
10984           break;
10985         case 16:
10986           mode = V4SImode;
10987           break;
10988         default:
10989           return -1;
10990         }
10991
10992       if (*modep == VOIDmode)
10993         *modep = mode;
10994
10995       /* Vector modes are considered to be opaque: two vectors are
10996          equivalent for the purposes of being homogeneous aggregates
10997          if they are the same size.  */
10998       if (*modep == mode)
10999         return 1;
11000
11001       break;
11002
11003     case ARRAY_TYPE:
11004       {
11005         int count;
11006         tree index = TYPE_DOMAIN (type);
11007
11008         /* Can't handle incomplete types nor sizes that are not
11009            fixed.  */
11010         if (!COMPLETE_TYPE_P (type)
11011             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11012           return -1;
11013
11014         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11015         if (count == -1
11016             || !index
11017             || !TYPE_MAX_VALUE (index)
11018             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11019             || !TYPE_MIN_VALUE (index)
11020             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11021             || count < 0)
11022           return -1;
11023
11024         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11025                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11026
11027         /* There must be no padding.  */
11028         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11029           return -1;
11030
11031         return count;
11032       }
11033
11034     case RECORD_TYPE:
11035       {
11036         int count = 0;
11037         int sub_count;
11038         tree field;
11039
11040         /* Can't handle incomplete types nor sizes that are not
11041            fixed.  */
11042         if (!COMPLETE_TYPE_P (type)
11043             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11044           return -1;
11045
11046         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11047           {
11048             if (TREE_CODE (field) != FIELD_DECL)
11049               continue;
11050
11051             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11052             if (sub_count < 0)
11053               return -1;
11054             count += sub_count;
11055           }
11056
11057         /* There must be no padding.  */
11058         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11059           return -1;
11060
11061         return count;
11062       }
11063
11064     case UNION_TYPE:
11065     case QUAL_UNION_TYPE:
11066       {
11067         /* These aren't very interesting except in a degenerate case.  */
11068         int count = 0;
11069         int sub_count;
11070         tree field;
11071
11072         /* Can't handle incomplete types nor sizes that are not
11073            fixed.  */
11074         if (!COMPLETE_TYPE_P (type)
11075             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11076           return -1;
11077
11078         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11079           {
11080             if (TREE_CODE (field) != FIELD_DECL)
11081               continue;
11082
11083             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11084             if (sub_count < 0)
11085               return -1;
11086             count = count > sub_count ? count : sub_count;
11087           }
11088
11089         /* There must be no padding.  */
11090         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11091           return -1;
11092
11093         return count;
11094       }
11095
11096     default:
11097       break;
11098     }
11099
11100   return -1;
11101 }
11102
11103 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11104    type as described in AAPCS64 \S 4.1.2.
11105
11106    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11107
11108 static bool
11109 aarch64_short_vector_p (const_tree type,
11110                         machine_mode mode)
11111 {
11112   HOST_WIDE_INT size = -1;
11113
11114   if (type && TREE_CODE (type) == VECTOR_TYPE)
11115     size = int_size_in_bytes (type);
11116   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11117             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11118     size = GET_MODE_SIZE (mode);
11119
11120   return (size == 8 || size == 16);
11121 }
11122
11123 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11124    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11125    array types.  The C99 floating-point complex types are also considered
11126    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11127    types, which are GCC extensions and out of the scope of AAPCS64, are
11128    treated as composite types here as well.
11129
11130    Note that MODE itself is not sufficient in determining whether a type
11131    is such a composite type or not.  This is because
11132    stor-layout.c:compute_record_mode may have already changed the MODE
11133    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11134    structure with only one field may have its MODE set to the mode of the
11135    field.  Also an integer mode whose size matches the size of the
11136    RECORD_TYPE type may be used to substitute the original mode
11137    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11138    solely relied on.  */
11139
11140 static bool
11141 aarch64_composite_type_p (const_tree type,
11142                           machine_mode mode)
11143 {
11144   if (aarch64_short_vector_p (type, mode))
11145     return false;
11146
11147   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11148     return true;
11149
11150   if (mode == BLKmode
11151       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11152       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11153     return true;
11154
11155   return false;
11156 }
11157
11158 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11159    shall be passed or returned in simd/fp register(s) (providing these
11160    parameter passing registers are available).
11161
11162    Upon successful return, *COUNT returns the number of needed registers,
11163    *BASE_MODE returns the mode of the individual register and when IS_HAF
11164    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11165    floating-point aggregate or a homogeneous short-vector aggregate.  */
11166
11167 static bool
11168 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11169                                          const_tree type,
11170                                          machine_mode *base_mode,
11171                                          int *count,
11172                                          bool *is_ha)
11173 {
11174   machine_mode new_mode = VOIDmode;
11175   bool composite_p = aarch64_composite_type_p (type, mode);
11176
11177   if (is_ha != NULL) *is_ha = false;
11178
11179   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11180       || aarch64_short_vector_p (type, mode))
11181     {
11182       *count = 1;
11183       new_mode = mode;
11184     }
11185   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11186     {
11187       if (is_ha != NULL) *is_ha = true;
11188       *count = 2;
11189       new_mode = GET_MODE_INNER (mode);
11190     }
11191   else if (type && composite_p)
11192     {
11193       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11194
11195       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11196         {
11197           if (is_ha != NULL) *is_ha = true;
11198           *count = ag_count;
11199         }
11200       else
11201         return false;
11202     }
11203   else
11204     return false;
11205
11206   *base_mode = new_mode;
11207   return true;
11208 }
11209
11210 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11211
11212 static rtx
11213 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11214                           int incoming ATTRIBUTE_UNUSED)
11215 {
11216   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11217 }
11218
11219 /* Implements target hook vector_mode_supported_p.  */
11220 static bool
11221 aarch64_vector_mode_supported_p (machine_mode mode)
11222 {
11223   if (TARGET_SIMD
11224       && (mode == V4SImode  || mode == V8HImode
11225           || mode == V16QImode || mode == V2DImode
11226           || mode == V2SImode  || mode == V4HImode
11227           || mode == V8QImode || mode == V2SFmode
11228           || mode == V4SFmode || mode == V2DFmode
11229           || mode == V4HFmode || mode == V8HFmode
11230           || mode == V1DFmode))
11231     return true;
11232
11233   return false;
11234 }
11235
11236 /* Return appropriate SIMD container
11237    for MODE within a vector of WIDTH bits.  */
11238 static machine_mode
11239 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11240 {
11241   gcc_assert (width == 64 || width == 128);
11242   if (TARGET_SIMD)
11243     {
11244       if (width == 128)
11245         switch (mode)
11246           {
11247           case E_DFmode:
11248             return V2DFmode;
11249           case E_SFmode:
11250             return V4SFmode;
11251           case E_HFmode:
11252             return V8HFmode;
11253           case E_SImode:
11254             return V4SImode;
11255           case E_HImode:
11256             return V8HImode;
11257           case E_QImode:
11258             return V16QImode;
11259           case E_DImode:
11260             return V2DImode;
11261           default:
11262             break;
11263           }
11264       else
11265         switch (mode)
11266           {
11267           case E_SFmode:
11268             return V2SFmode;
11269           case E_HFmode:
11270             return V4HFmode;
11271           case E_SImode:
11272             return V2SImode;
11273           case E_HImode:
11274             return V4HImode;
11275           case E_QImode:
11276             return V8QImode;
11277           default:
11278             break;
11279           }
11280     }
11281   return word_mode;
11282 }
11283
11284 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11285 static machine_mode
11286 aarch64_preferred_simd_mode (scalar_mode mode)
11287 {
11288   return aarch64_simd_container_mode (mode, 128);
11289 }
11290
11291 /* Return the bitmask of possible vector sizes for the vectorizer
11292    to iterate over.  */
11293 static unsigned int
11294 aarch64_autovectorize_vector_sizes (void)
11295 {
11296   return (16 | 8);
11297 }
11298
11299 /* Implement TARGET_MANGLE_TYPE.  */
11300
11301 static const char *
11302 aarch64_mangle_type (const_tree type)
11303 {
11304   /* The AArch64 ABI documents say that "__va_list" has to be
11305      managled as if it is in the "std" namespace.  */
11306   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11307     return "St9__va_list";
11308
11309   /* Half-precision float.  */
11310   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11311     return "Dh";
11312
11313   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11314      builtin types.  */
11315   if (TYPE_NAME (type) != NULL)
11316     return aarch64_mangle_builtin_type (type);
11317
11318   /* Use the default mangling.  */
11319   return NULL;
11320 }
11321
11322 /* Find the first rtx_insn before insn that will generate an assembly
11323    instruction.  */
11324
11325 static rtx_insn *
11326 aarch64_prev_real_insn (rtx_insn *insn)
11327 {
11328   if (!insn)
11329     return NULL;
11330
11331   do
11332     {
11333       insn = prev_real_insn (insn);
11334     }
11335   while (insn && recog_memoized (insn) < 0);
11336
11337   return insn;
11338 }
11339
11340 static bool
11341 is_madd_op (enum attr_type t1)
11342 {
11343   unsigned int i;
11344   /* A number of these may be AArch32 only.  */
11345   enum attr_type mlatypes[] = {
11346     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11347     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11348     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11349   };
11350
11351   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11352     {
11353       if (t1 == mlatypes[i])
11354         return true;
11355     }
11356
11357   return false;
11358 }
11359
11360 /* Check if there is a register dependency between a load and the insn
11361    for which we hold recog_data.  */
11362
11363 static bool
11364 dep_between_memop_and_curr (rtx memop)
11365 {
11366   rtx load_reg;
11367   int opno;
11368
11369   gcc_assert (GET_CODE (memop) == SET);
11370
11371   if (!REG_P (SET_DEST (memop)))
11372     return false;
11373
11374   load_reg = SET_DEST (memop);
11375   for (opno = 1; opno < recog_data.n_operands; opno++)
11376     {
11377       rtx operand = recog_data.operand[opno];
11378       if (REG_P (operand)
11379           && reg_overlap_mentioned_p (load_reg, operand))
11380         return true;
11381
11382     }
11383   return false;
11384 }
11385
11386
11387 /* When working around the Cortex-A53 erratum 835769,
11388    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11389    instruction and has a preceding memory instruction such that a NOP
11390    should be inserted between them.  */
11391
11392 bool
11393 aarch64_madd_needs_nop (rtx_insn* insn)
11394 {
11395   enum attr_type attr_type;
11396   rtx_insn *prev;
11397   rtx body;
11398
11399   if (!TARGET_FIX_ERR_A53_835769)
11400     return false;
11401
11402   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11403     return false;
11404
11405   attr_type = get_attr_type (insn);
11406   if (!is_madd_op (attr_type))
11407     return false;
11408
11409   prev = aarch64_prev_real_insn (insn);
11410   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11411      Restore recog state to INSN to avoid state corruption.  */
11412   extract_constrain_insn_cached (insn);
11413
11414   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11415     return false;
11416
11417   body = single_set (prev);
11418
11419   /* If the previous insn is a memory op and there is no dependency between
11420      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11421      have a complex memory operation, probably a load/store pair.
11422      Be conservative for now and emit a NOP.  */
11423   if (GET_MODE (recog_data.operand[0]) == DImode
11424       && (!body || !dep_between_memop_and_curr (body)))
11425     return true;
11426
11427   return false;
11428
11429 }
11430
11431
11432 /* Implement FINAL_PRESCAN_INSN.  */
11433
11434 void
11435 aarch64_final_prescan_insn (rtx_insn *insn)
11436 {
11437   if (aarch64_madd_needs_nop (insn))
11438     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11439 }
11440
11441
11442 /* Return the equivalent letter for size.  */
11443 static char
11444 sizetochar (int size)
11445 {
11446   switch (size)
11447     {
11448     case 64: return 'd';
11449     case 32: return 's';
11450     case 16: return 'h';
11451     case 8 : return 'b';
11452     default: gcc_unreachable ();
11453     }
11454 }
11455
11456 /* Return true iff x is a uniform vector of floating-point
11457    constants, and the constant can be represented in
11458    quarter-precision form.  Note, as aarch64_float_const_representable
11459    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11460 static bool
11461 aarch64_vect_float_const_representable_p (rtx x)
11462 {
11463   rtx elt;
11464   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11465           && const_vec_duplicate_p (x, &elt)
11466           && aarch64_float_const_representable_p (elt));
11467 }
11468
11469 /* Return true for valid and false for invalid.  */
11470 bool
11471 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11472                               struct simd_immediate_info *info)
11473 {
11474 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11475   matches = 1;                                          \
11476   for (i = 0; i < idx; i += (STRIDE))                   \
11477     if (!(TEST))                                        \
11478       matches = 0;                                      \
11479   if (matches)                                          \
11480     {                                                   \
11481       immtype = (CLASS);                                \
11482       elsize = (ELSIZE);                                \
11483       eshift = (SHIFT);                                 \
11484       emvn = (NEG);                                     \
11485       break;                                            \
11486     }
11487
11488   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11489   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11490   unsigned char bytes[16];
11491   int immtype = -1, matches;
11492   unsigned int invmask = inverse ? 0xff : 0;
11493   int eshift, emvn;
11494
11495   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11496     {
11497       if (! (aarch64_simd_imm_zero_p (op, mode)
11498              || aarch64_vect_float_const_representable_p (op)))
11499         return false;
11500
11501       if (info)
11502         {
11503           rtx elt = CONST_VECTOR_ELT (op, 0);
11504           scalar_float_mode elt_mode
11505             = as_a <scalar_float_mode> (GET_MODE (elt));
11506
11507           info->value = elt;
11508           info->element_width = GET_MODE_BITSIZE (elt_mode);
11509           info->mvn = false;
11510           info->shift = 0;
11511         }
11512
11513       return true;
11514     }
11515
11516   /* Splat vector constant out into a byte vector.  */
11517   for (i = 0; i < n_elts; i++)
11518     {
11519       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11520          it must be laid out in the vector register in reverse order.  */
11521       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11522       unsigned HOST_WIDE_INT elpart;
11523
11524       gcc_assert (CONST_INT_P (el));
11525       elpart = INTVAL (el);
11526
11527       for (unsigned int byte = 0; byte < innersize; byte++)
11528         {
11529           bytes[idx++] = (elpart & 0xff) ^ invmask;
11530           elpart >>= BITS_PER_UNIT;
11531         }
11532
11533     }
11534
11535   /* Sanity check.  */
11536   gcc_assert (idx == GET_MODE_SIZE (mode));
11537
11538   do
11539     {
11540       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11541              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11542
11543       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11544              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11545
11546       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11547              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11548
11549       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11550              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11551
11552       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11553
11554       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11555
11556       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11557              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11558
11559       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11560              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11561
11562       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11563              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11564
11565       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11566              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11567
11568       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11569
11570       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11571
11572       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11573              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11574
11575       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11576              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11577
11578       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11579              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11580
11581       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11582              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11583
11584       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11585
11586       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11587              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11588     }
11589   while (0);
11590
11591   if (immtype == -1)
11592     return false;
11593
11594   if (info)
11595     {
11596       info->element_width = elsize;
11597       info->mvn = emvn != 0;
11598       info->shift = eshift;
11599
11600       unsigned HOST_WIDE_INT imm = 0;
11601
11602       if (immtype >= 12 && immtype <= 15)
11603         info->msl = true;
11604
11605       /* Un-invert bytes of recognized vector, if necessary.  */
11606       if (invmask != 0)
11607         for (i = 0; i < idx; i++)
11608           bytes[i] ^= invmask;
11609
11610       if (immtype == 17)
11611         {
11612           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11613           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11614
11615           for (i = 0; i < 8; i++)
11616             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11617               << (i * BITS_PER_UNIT);
11618
11619
11620           info->value = GEN_INT (imm);
11621         }
11622       else
11623         {
11624           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11625             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11626
11627           /* Construct 'abcdefgh' because the assembler cannot handle
11628              generic constants.  */
11629           if (info->mvn)
11630             imm = ~imm;
11631           imm = (imm >> info->shift) & 0xff;
11632           info->value = GEN_INT (imm);
11633         }
11634     }
11635
11636   return true;
11637 #undef CHECK
11638 }
11639
11640 /* Check of immediate shift constants are within range.  */
11641 bool
11642 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11643 {
11644   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11645   if (left)
11646     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11647   else
11648     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11649 }
11650
11651 /* Return true if X is a uniform vector where all elements
11652    are either the floating-point constant 0.0 or the
11653    integer constant 0.  */
11654 bool
11655 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11656 {
11657   return x == CONST0_RTX (mode);
11658 }
11659
11660
11661 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11662    operation of width WIDTH at bit position POS.  */
11663
11664 rtx
11665 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11666 {
11667   gcc_assert (CONST_INT_P (width));
11668   gcc_assert (CONST_INT_P (pos));
11669
11670   unsigned HOST_WIDE_INT mask
11671     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11672   return GEN_INT (mask << UINTVAL (pos));
11673 }
11674
11675 bool
11676 aarch64_mov_operand_p (rtx x, machine_mode mode)
11677 {
11678   if (GET_CODE (x) == HIGH
11679       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11680     return true;
11681
11682   if (CONST_INT_P (x))
11683     return true;
11684
11685   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11686     return true;
11687
11688   return aarch64_classify_symbolic_expression (x)
11689     == SYMBOL_TINY_ABSOLUTE;
11690 }
11691
11692 /* Return a const_int vector of VAL.  */
11693 rtx
11694 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11695 {
11696   int nunits = GET_MODE_NUNITS (mode);
11697   rtvec v = rtvec_alloc (nunits);
11698   int i;
11699
11700   rtx cache = GEN_INT (val);
11701
11702   for (i=0; i < nunits; i++)
11703     RTVEC_ELT (v, i) = cache;
11704
11705   return gen_rtx_CONST_VECTOR (mode, v);
11706 }
11707
11708 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11709
11710 bool
11711 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11712 {
11713   machine_mode vmode;
11714
11715   vmode = aarch64_preferred_simd_mode (mode);
11716   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11717   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11718 }
11719
11720 /* Construct and return a PARALLEL RTX vector with elements numbering the
11721    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11722    the vector - from the perspective of the architecture.  This does not
11723    line up with GCC's perspective on lane numbers, so we end up with
11724    different masks depending on our target endian-ness.  The diagram
11725    below may help.  We must draw the distinction when building masks
11726    which select one half of the vector.  An instruction selecting
11727    architectural low-lanes for a big-endian target, must be described using
11728    a mask selecting GCC high-lanes.
11729
11730                  Big-Endian             Little-Endian
11731
11732 GCC             0   1   2   3           3   2   1   0
11733               | x | x | x | x |       | x | x | x | x |
11734 Architecture    3   2   1   0           3   2   1   0
11735
11736 Low Mask:         { 2, 3 }                { 0, 1 }
11737 High Mask:        { 0, 1 }                { 2, 3 }
11738 */
11739
11740 rtx
11741 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11742 {
11743   int nunits = GET_MODE_NUNITS (mode);
11744   rtvec v = rtvec_alloc (nunits / 2);
11745   int high_base = nunits / 2;
11746   int low_base = 0;
11747   int base;
11748   rtx t1;
11749   int i;
11750
11751   if (BYTES_BIG_ENDIAN)
11752     base = high ? low_base : high_base;
11753   else
11754     base = high ? high_base : low_base;
11755
11756   for (i = 0; i < nunits / 2; i++)
11757     RTVEC_ELT (v, i) = GEN_INT (base + i);
11758
11759   t1 = gen_rtx_PARALLEL (mode, v);
11760   return t1;
11761 }
11762
11763 /* Check OP for validity as a PARALLEL RTX vector with elements
11764    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11765    from the perspective of the architecture.  See the diagram above
11766    aarch64_simd_vect_par_cnst_half for more details.  */
11767
11768 bool
11769 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11770                                        bool high)
11771 {
11772   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11773   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11774   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11775   int i = 0;
11776
11777   if (!VECTOR_MODE_P (mode))
11778     return false;
11779
11780   if (count_op != count_ideal)
11781     return false;
11782
11783   for (i = 0; i < count_ideal; i++)
11784     {
11785       rtx elt_op = XVECEXP (op, 0, i);
11786       rtx elt_ideal = XVECEXP (ideal, 0, i);
11787
11788       if (!CONST_INT_P (elt_op)
11789           || INTVAL (elt_ideal) != INTVAL (elt_op))
11790         return false;
11791     }
11792   return true;
11793 }
11794
11795 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11796    HIGH (exclusive).  */
11797 void
11798 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11799                           const_tree exp)
11800 {
11801   HOST_WIDE_INT lane;
11802   gcc_assert (CONST_INT_P (operand));
11803   lane = INTVAL (operand);
11804
11805   if (lane < low || lane >= high)
11806   {
11807     if (exp)
11808       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11809     else
11810       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11811   }
11812 }
11813
11814 /* Return TRUE if OP is a valid vector addressing mode.  */
11815 bool
11816 aarch64_simd_mem_operand_p (rtx op)
11817 {
11818   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11819                         || REG_P (XEXP (op, 0)));
11820 }
11821
11822 /* Emit a register copy from operand to operand, taking care not to
11823    early-clobber source registers in the process.
11824
11825    COUNT is the number of components into which the copy needs to be
11826    decomposed.  */
11827 void
11828 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11829                                 unsigned int count)
11830 {
11831   unsigned int i;
11832   int rdest = REGNO (operands[0]);
11833   int rsrc = REGNO (operands[1]);
11834
11835   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11836       || rdest < rsrc)
11837     for (i = 0; i < count; i++)
11838       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11839                       gen_rtx_REG (mode, rsrc + i));
11840   else
11841     for (i = 0; i < count; i++)
11842       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11843                       gen_rtx_REG (mode, rsrc + count - i - 1));
11844 }
11845
11846 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11847    one of VSTRUCT modes: OI, CI, or XI.  */
11848 int
11849 aarch64_simd_attr_length_rglist (machine_mode mode)
11850 {
11851   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11852 }
11853
11854 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11855    alignment of a vector to 128 bits.  */
11856 static HOST_WIDE_INT
11857 aarch64_simd_vector_alignment (const_tree type)
11858 {
11859   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11860   return MIN (align, 128);
11861 }
11862
11863 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11864 static bool
11865 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11866 {
11867   if (is_packed)
11868     return false;
11869
11870   /* We guarantee alignment for vectors up to 128-bits.  */
11871   if (tree_int_cst_compare (TYPE_SIZE (type),
11872                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11873     return false;
11874
11875   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11876   return true;
11877 }
11878
11879 /* Return true if the vector misalignment factor is supported by the
11880    target.  */
11881 static bool
11882 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11883                                              const_tree type, int misalignment,
11884                                              bool is_packed)
11885 {
11886   if (TARGET_SIMD && STRICT_ALIGNMENT)
11887     {
11888       /* Return if movmisalign pattern is not supported for this mode.  */
11889       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11890         return false;
11891
11892       if (misalignment == -1)
11893         {
11894           /* Misalignment factor is unknown at compile time but we know
11895              it's word aligned.  */
11896           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11897             {
11898               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11899
11900               if (element_size != 64)
11901                 return true;
11902             }
11903           return false;
11904         }
11905     }
11906   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11907                                                       is_packed);
11908 }
11909
11910 /* If VALS is a vector constant that can be loaded into a register
11911    using DUP, generate instructions to do so and return an RTX to
11912    assign to the register.  Otherwise return NULL_RTX.  */
11913 static rtx
11914 aarch64_simd_dup_constant (rtx vals)
11915 {
11916   machine_mode mode = GET_MODE (vals);
11917   machine_mode inner_mode = GET_MODE_INNER (mode);
11918   rtx x;
11919
11920   if (!const_vec_duplicate_p (vals, &x))
11921     return NULL_RTX;
11922
11923   /* We can load this constant by using DUP and a constant in a
11924      single ARM register.  This will be cheaper than a vector
11925      load.  */
11926   x = copy_to_mode_reg (inner_mode, x);
11927   return gen_rtx_VEC_DUPLICATE (mode, x);
11928 }
11929
11930
11931 /* Generate code to load VALS, which is a PARALLEL containing only
11932    constants (for vec_init) or CONST_VECTOR, efficiently into a
11933    register.  Returns an RTX to copy into the register, or NULL_RTX
11934    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11935 static rtx
11936 aarch64_simd_make_constant (rtx vals)
11937 {
11938   machine_mode mode = GET_MODE (vals);
11939   rtx const_dup;
11940   rtx const_vec = NULL_RTX;
11941   int n_elts = GET_MODE_NUNITS (mode);
11942   int n_const = 0;
11943   int i;
11944
11945   if (GET_CODE (vals) == CONST_VECTOR)
11946     const_vec = vals;
11947   else if (GET_CODE (vals) == PARALLEL)
11948     {
11949       /* A CONST_VECTOR must contain only CONST_INTs and
11950          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11951          Only store valid constants in a CONST_VECTOR.  */
11952       for (i = 0; i < n_elts; ++i)
11953         {
11954           rtx x = XVECEXP (vals, 0, i);
11955           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11956             n_const++;
11957         }
11958       if (n_const == n_elts)
11959         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11960     }
11961   else
11962     gcc_unreachable ();
11963
11964   if (const_vec != NULL_RTX
11965       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11966     /* Load using MOVI/MVNI.  */
11967     return const_vec;
11968   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11969     /* Loaded using DUP.  */
11970     return const_dup;
11971   else if (const_vec != NULL_RTX)
11972     /* Load from constant pool. We can not take advantage of single-cycle
11973        LD1 because we need a PC-relative addressing mode.  */
11974     return const_vec;
11975   else
11976     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11977        We can not construct an initializer.  */
11978     return NULL_RTX;
11979 }
11980
11981 /* Expand a vector initialisation sequence, such that TARGET is
11982    initialised to contain VALS.  */
11983
11984 void
11985 aarch64_expand_vector_init (rtx target, rtx vals)
11986 {
11987   machine_mode mode = GET_MODE (target);
11988   scalar_mode inner_mode = GET_MODE_INNER (mode);
11989   /* The number of vector elements.  */
11990   int n_elts = GET_MODE_NUNITS (mode);
11991   /* The number of vector elements which are not constant.  */
11992   int n_var = 0;
11993   rtx any_const = NULL_RTX;
11994   /* The first element of vals.  */
11995   rtx v0 = XVECEXP (vals, 0, 0);
11996   bool all_same = true;
11997
11998   /* Count the number of variable elements to initialise.  */
11999   for (int i = 0; i < n_elts; ++i)
12000     {
12001       rtx x = XVECEXP (vals, 0, i);
12002       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12003         ++n_var;
12004       else
12005         any_const = x;
12006
12007       all_same &= rtx_equal_p (x, v0);
12008     }
12009
12010   /* No variable elements, hand off to aarch64_simd_make_constant which knows
12011      how best to handle this.  */
12012   if (n_var == 0)
12013     {
12014       rtx constant = aarch64_simd_make_constant (vals);
12015       if (constant != NULL_RTX)
12016         {
12017           emit_move_insn (target, constant);
12018           return;
12019         }
12020     }
12021
12022   /* Splat a single non-constant element if we can.  */
12023   if (all_same)
12024     {
12025       rtx x = copy_to_mode_reg (inner_mode, v0);
12026       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12027       return;
12028     }
12029
12030   enum insn_code icode = optab_handler (vec_set_optab, mode);
12031   gcc_assert (icode != CODE_FOR_nothing);
12032
12033   /* If there are only variable elements, try to optimize
12034      the insertion using dup for the most common element
12035      followed by insertions.  */
12036
12037   /* The algorithm will fill matches[*][0] with the earliest matching element,
12038      and matches[X][1] with the count of duplicate elements (if X is the
12039      earliest element which has duplicates).  */
12040
12041   if (n_var == n_elts && n_elts <= 16)
12042     {
12043       int matches[16][2] = {0};
12044       for (int i = 0; i < n_elts; i++)
12045         {
12046           for (int j = 0; j <= i; j++)
12047             {
12048               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12049                 {
12050                   matches[i][0] = j;
12051                   matches[j][1]++;
12052                   break;
12053                 }
12054             }
12055         }
12056       int maxelement = 0;
12057       int maxv = 0;
12058       for (int i = 0; i < n_elts; i++)
12059         if (matches[i][1] > maxv)
12060           {
12061             maxelement = i;
12062             maxv = matches[i][1];
12063           }
12064
12065       /* Create a duplicate of the most common element.  */
12066       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12067       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12068
12069       /* Insert the rest.  */
12070       for (int i = 0; i < n_elts; i++)
12071         {
12072           rtx x = XVECEXP (vals, 0, i);
12073           if (matches[i][0] == maxelement)
12074             continue;
12075           x = copy_to_mode_reg (inner_mode, x);
12076           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12077         }
12078       return;
12079     }
12080
12081   /* Initialise a vector which is part-variable.  We want to first try
12082      to build those lanes which are constant in the most efficient way we
12083      can.  */
12084   if (n_var != n_elts)
12085     {
12086       rtx copy = copy_rtx (vals);
12087
12088       /* Load constant part of vector.  We really don't care what goes into the
12089          parts we will overwrite, but we're more likely to be able to load the
12090          constant efficiently if it has fewer, larger, repeating parts
12091          (see aarch64_simd_valid_immediate).  */
12092       for (int i = 0; i < n_elts; i++)
12093         {
12094           rtx x = XVECEXP (vals, 0, i);
12095           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12096             continue;
12097           rtx subst = any_const;
12098           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12099             {
12100               /* Look in the copied vector, as more elements are const.  */
12101               rtx test = XVECEXP (copy, 0, i ^ bit);
12102               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12103                 {
12104                   subst = test;
12105                   break;
12106                 }
12107             }
12108           XVECEXP (copy, 0, i) = subst;
12109         }
12110       aarch64_expand_vector_init (target, copy);
12111     }
12112
12113   /* Insert the variable lanes directly.  */
12114   for (int i = 0; i < n_elts; i++)
12115     {
12116       rtx x = XVECEXP (vals, 0, i);
12117       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12118         continue;
12119       x = copy_to_mode_reg (inner_mode, x);
12120       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12121     }
12122 }
12123
12124 static unsigned HOST_WIDE_INT
12125 aarch64_shift_truncation_mask (machine_mode mode)
12126 {
12127   return
12128     (!SHIFT_COUNT_TRUNCATED
12129      || aarch64_vector_mode_supported_p (mode)
12130      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12131 }
12132
12133 /* Select a format to encode pointers in exception handling data.  */
12134 int
12135 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12136 {
12137    int type;
12138    switch (aarch64_cmodel)
12139      {
12140      case AARCH64_CMODEL_TINY:
12141      case AARCH64_CMODEL_TINY_PIC:
12142      case AARCH64_CMODEL_SMALL:
12143      case AARCH64_CMODEL_SMALL_PIC:
12144      case AARCH64_CMODEL_SMALL_SPIC:
12145        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12146           for everything.  */
12147        type = DW_EH_PE_sdata4;
12148        break;
12149      default:
12150        /* No assumptions here.  8-byte relocs required.  */
12151        type = DW_EH_PE_sdata8;
12152        break;
12153      }
12154    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12155 }
12156
12157 /* The last .arch and .tune assembly strings that we printed.  */
12158 static std::string aarch64_last_printed_arch_string;
12159 static std::string aarch64_last_printed_tune_string;
12160
12161 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12162    by the function fndecl.  */
12163
12164 void
12165 aarch64_declare_function_name (FILE *stream, const char* name,
12166                                 tree fndecl)
12167 {
12168   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12169
12170   struct cl_target_option *targ_options;
12171   if (target_parts)
12172     targ_options = TREE_TARGET_OPTION (target_parts);
12173   else
12174     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12175   gcc_assert (targ_options);
12176
12177   const struct processor *this_arch
12178     = aarch64_get_arch (targ_options->x_explicit_arch);
12179
12180   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12181   std::string extension
12182     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12183                                                   this_arch->flags);
12184   /* Only update the assembler .arch string if it is distinct from the last
12185      such string we printed.  */
12186   std::string to_print = this_arch->name + extension;
12187   if (to_print != aarch64_last_printed_arch_string)
12188     {
12189       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12190       aarch64_last_printed_arch_string = to_print;
12191     }
12192
12193   /* Print the cpu name we're tuning for in the comments, might be
12194      useful to readers of the generated asm.  Do it only when it changes
12195      from function to function and verbose assembly is requested.  */
12196   const struct processor *this_tune
12197     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12198
12199   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12200     {
12201       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12202                    this_tune->name);
12203       aarch64_last_printed_tune_string = this_tune->name;
12204     }
12205
12206   /* Don't forget the type directive for ELF.  */
12207   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12208   ASM_OUTPUT_LABEL (stream, name);
12209 }
12210
12211 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12212
12213 static void
12214 aarch64_start_file (void)
12215 {
12216   struct cl_target_option *default_options
12217     = TREE_TARGET_OPTION (target_option_default_node);
12218
12219   const struct processor *default_arch
12220     = aarch64_get_arch (default_options->x_explicit_arch);
12221   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12222   std::string extension
12223     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12224                                                   default_arch->flags);
12225
12226    aarch64_last_printed_arch_string = default_arch->name + extension;
12227    aarch64_last_printed_tune_string = "";
12228    asm_fprintf (asm_out_file, "\t.arch %s\n",
12229                 aarch64_last_printed_arch_string.c_str ());
12230
12231    default_file_start ();
12232 }
12233
12234 /* Emit load exclusive.  */
12235
12236 static void
12237 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12238                              rtx mem, rtx model_rtx)
12239 {
12240   rtx (*gen) (rtx, rtx, rtx);
12241
12242   switch (mode)
12243     {
12244     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12245     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12246     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12247     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12248     default:
12249       gcc_unreachable ();
12250     }
12251
12252   emit_insn (gen (rval, mem, model_rtx));
12253 }
12254
12255 /* Emit store exclusive.  */
12256
12257 static void
12258 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12259                               rtx rval, rtx mem, rtx model_rtx)
12260 {
12261   rtx (*gen) (rtx, rtx, rtx, rtx);
12262
12263   switch (mode)
12264     {
12265     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12266     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12267     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12268     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12269     default:
12270       gcc_unreachable ();
12271     }
12272
12273   emit_insn (gen (bval, rval, mem, model_rtx));
12274 }
12275
12276 /* Mark the previous jump instruction as unlikely.  */
12277
12278 static void
12279 aarch64_emit_unlikely_jump (rtx insn)
12280 {
12281   rtx_insn *jump = emit_jump_insn (insn);
12282   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12283 }
12284
12285 /* Expand a compare and swap pattern.  */
12286
12287 void
12288 aarch64_expand_compare_and_swap (rtx operands[])
12289 {
12290   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12291   machine_mode mode, cmp_mode;
12292   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12293   int idx;
12294   gen_cas_fn gen;
12295   const gen_cas_fn split_cas[] =
12296   {
12297     gen_aarch64_compare_and_swapqi,
12298     gen_aarch64_compare_and_swaphi,
12299     gen_aarch64_compare_and_swapsi,
12300     gen_aarch64_compare_and_swapdi
12301   };
12302   const gen_cas_fn atomic_cas[] =
12303   {
12304     gen_aarch64_compare_and_swapqi_lse,
12305     gen_aarch64_compare_and_swaphi_lse,
12306     gen_aarch64_compare_and_swapsi_lse,
12307     gen_aarch64_compare_and_swapdi_lse
12308   };
12309
12310   bval = operands[0];
12311   rval = operands[1];
12312   mem = operands[2];
12313   oldval = operands[3];
12314   newval = operands[4];
12315   is_weak = operands[5];
12316   mod_s = operands[6];
12317   mod_f = operands[7];
12318   mode = GET_MODE (mem);
12319   cmp_mode = mode;
12320
12321   /* Normally the succ memory model must be stronger than fail, but in the
12322      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12323      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12324
12325   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12326       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12327     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12328
12329   switch (mode)
12330     {
12331     case E_QImode:
12332     case E_HImode:
12333       /* For short modes, we're going to perform the comparison in SImode,
12334          so do the zero-extension now.  */
12335       cmp_mode = SImode;
12336       rval = gen_reg_rtx (SImode);
12337       oldval = convert_modes (SImode, mode, oldval, true);
12338       /* Fall through.  */
12339
12340     case E_SImode:
12341     case E_DImode:
12342       /* Force the value into a register if needed.  */
12343       if (!aarch64_plus_operand (oldval, mode))
12344         oldval = force_reg (cmp_mode, oldval);
12345       break;
12346
12347     default:
12348       gcc_unreachable ();
12349     }
12350
12351   switch (mode)
12352     {
12353     case E_QImode: idx = 0; break;
12354     case E_HImode: idx = 1; break;
12355     case E_SImode: idx = 2; break;
12356     case E_DImode: idx = 3; break;
12357     default:
12358       gcc_unreachable ();
12359     }
12360   if (TARGET_LSE)
12361     gen = atomic_cas[idx];
12362   else
12363     gen = split_cas[idx];
12364
12365   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12366
12367   if (mode == QImode || mode == HImode)
12368     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12369
12370   x = gen_rtx_REG (CCmode, CC_REGNUM);
12371   x = gen_rtx_EQ (SImode, x, const0_rtx);
12372   emit_insn (gen_rtx_SET (bval, x));
12373 }
12374
12375 /* Test whether the target supports using a atomic load-operate instruction.
12376    CODE is the operation and AFTER is TRUE if the data in memory after the
12377    operation should be returned and FALSE if the data before the operation
12378    should be returned.  Returns FALSE if the operation isn't supported by the
12379    architecture.  */
12380
12381 bool
12382 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12383 {
12384   if (!TARGET_LSE)
12385     return false;
12386
12387   switch (code)
12388     {
12389     case SET:
12390     case AND:
12391     case IOR:
12392     case XOR:
12393     case MINUS:
12394     case PLUS:
12395       return true;
12396     default:
12397       return false;
12398     }
12399 }
12400
12401 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12402    sequence implementing an atomic operation.  */
12403
12404 static void
12405 aarch64_emit_post_barrier (enum memmodel model)
12406 {
12407   const enum memmodel base_model = memmodel_base (model);
12408
12409   if (is_mm_sync (model)
12410       && (base_model == MEMMODEL_ACQUIRE
12411           || base_model == MEMMODEL_ACQ_REL
12412           || base_model == MEMMODEL_SEQ_CST))
12413     {
12414       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12415     }
12416 }
12417
12418 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12419    for the data in memory.  EXPECTED is the value expected to be in memory.
12420    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12421    is the memory ordering to use.  */
12422
12423 void
12424 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12425                         rtx expected, rtx desired,
12426                         rtx model)
12427 {
12428   rtx (*gen) (rtx, rtx, rtx, rtx);
12429   machine_mode mode;
12430
12431   mode = GET_MODE (mem);
12432
12433   switch (mode)
12434     {
12435     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12436     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12437     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12438     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12439     default:
12440       gcc_unreachable ();
12441     }
12442
12443   /* Move the expected value into the CAS destination register.  */
12444   emit_insn (gen_rtx_SET (rval, expected));
12445
12446   /* Emit the CAS.  */
12447   emit_insn (gen (rval, mem, desired, model));
12448
12449   /* Compare the expected value with the value loaded by the CAS, to establish
12450      whether the swap was made.  */
12451   aarch64_gen_compare_reg (EQ, rval, expected);
12452 }
12453
12454 /* Split a compare and swap pattern.  */
12455
12456 void
12457 aarch64_split_compare_and_swap (rtx operands[])
12458 {
12459   rtx rval, mem, oldval, newval, scratch;
12460   machine_mode mode;
12461   bool is_weak;
12462   rtx_code_label *label1, *label2;
12463   rtx x, cond;
12464   enum memmodel model;
12465   rtx model_rtx;
12466
12467   rval = operands[0];
12468   mem = operands[1];
12469   oldval = operands[2];
12470   newval = operands[3];
12471   is_weak = (operands[4] != const0_rtx);
12472   model_rtx = operands[5];
12473   scratch = operands[7];
12474   mode = GET_MODE (mem);
12475   model = memmodel_from_int (INTVAL (model_rtx));
12476
12477   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12478     loop:
12479     .label1:
12480         LD[A]XR rval, [mem]
12481         CBNZ    rval, .label2
12482         ST[L]XR scratch, newval, [mem]
12483         CBNZ    scratch, .label1
12484     .label2:
12485         CMP     rval, 0.  */
12486   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12487
12488   label1 = NULL;
12489   if (!is_weak)
12490     {
12491       label1 = gen_label_rtx ();
12492       emit_label (label1);
12493     }
12494   label2 = gen_label_rtx ();
12495
12496   /* The initial load can be relaxed for a __sync operation since a final
12497      barrier will be emitted to stop code hoisting.  */
12498   if (is_mm_sync (model))
12499     aarch64_emit_load_exclusive (mode, rval, mem,
12500                                  GEN_INT (MEMMODEL_RELAXED));
12501   else
12502     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12503
12504   if (strong_zero_p)
12505     {
12506       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12507       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12508                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12509       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12510     }
12511   else
12512     {
12513       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12514       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12515       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12516                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12517       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12518     }
12519
12520   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12521
12522   if (!is_weak)
12523     {
12524       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12525       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12526                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12527       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12528     }
12529   else
12530     {
12531       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12532       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12533       emit_insn (gen_rtx_SET (cond, x));
12534     }
12535
12536   emit_label (label2);
12537   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12538      to set the condition flags.  If this is not used it will be removed by
12539      later passes.  */
12540   if (strong_zero_p)
12541     {
12542       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12543       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12544       emit_insn (gen_rtx_SET (cond, x));
12545     }
12546   /* Emit any final barrier needed for a __sync operation.  */
12547   if (is_mm_sync (model))
12548     aarch64_emit_post_barrier (model);
12549 }
12550
12551 /* Emit a BIC instruction.  */
12552
12553 static void
12554 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12555 {
12556   rtx shift_rtx = GEN_INT (shift);
12557   rtx (*gen) (rtx, rtx, rtx, rtx);
12558
12559   switch (mode)
12560     {
12561     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12562     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12563     default:
12564       gcc_unreachable ();
12565     }
12566
12567   emit_insn (gen (dst, s2, shift_rtx, s1));
12568 }
12569
12570 /* Emit an atomic swap.  */
12571
12572 static void
12573 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12574                           rtx mem, rtx model)
12575 {
12576   rtx (*gen) (rtx, rtx, rtx, rtx);
12577
12578   switch (mode)
12579     {
12580     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12581     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12582     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12583     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12584     default:
12585       gcc_unreachable ();
12586     }
12587
12588   emit_insn (gen (dst, mem, value, model));
12589 }
12590
12591 /* Operations supported by aarch64_emit_atomic_load_op.  */
12592
12593 enum aarch64_atomic_load_op_code
12594 {
12595   AARCH64_LDOP_PLUS,    /* A + B  */
12596   AARCH64_LDOP_XOR,     /* A ^ B  */
12597   AARCH64_LDOP_OR,      /* A | B  */
12598   AARCH64_LDOP_BIC      /* A & ~B  */
12599 };
12600
12601 /* Emit an atomic load-operate.  */
12602
12603 static void
12604 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12605                              machine_mode mode, rtx dst, rtx src,
12606                              rtx mem, rtx model)
12607 {
12608   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12609   const aarch64_atomic_load_op_fn plus[] =
12610   {
12611     gen_aarch64_atomic_loadaddqi,
12612     gen_aarch64_atomic_loadaddhi,
12613     gen_aarch64_atomic_loadaddsi,
12614     gen_aarch64_atomic_loadadddi
12615   };
12616   const aarch64_atomic_load_op_fn eor[] =
12617   {
12618     gen_aarch64_atomic_loadeorqi,
12619     gen_aarch64_atomic_loadeorhi,
12620     gen_aarch64_atomic_loadeorsi,
12621     gen_aarch64_atomic_loadeordi
12622   };
12623   const aarch64_atomic_load_op_fn ior[] =
12624   {
12625     gen_aarch64_atomic_loadsetqi,
12626     gen_aarch64_atomic_loadsethi,
12627     gen_aarch64_atomic_loadsetsi,
12628     gen_aarch64_atomic_loadsetdi
12629   };
12630   const aarch64_atomic_load_op_fn bic[] =
12631   {
12632     gen_aarch64_atomic_loadclrqi,
12633     gen_aarch64_atomic_loadclrhi,
12634     gen_aarch64_atomic_loadclrsi,
12635     gen_aarch64_atomic_loadclrdi
12636   };
12637   aarch64_atomic_load_op_fn gen;
12638   int idx = 0;
12639
12640   switch (mode)
12641     {
12642     case E_QImode: idx = 0; break;
12643     case E_HImode: idx = 1; break;
12644     case E_SImode: idx = 2; break;
12645     case E_DImode: idx = 3; break;
12646     default:
12647       gcc_unreachable ();
12648     }
12649
12650   switch (code)
12651     {
12652     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12653     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12654     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12655     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12656     default:
12657       gcc_unreachable ();
12658     }
12659
12660   emit_insn (gen (dst, mem, src, model));
12661 }
12662
12663 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12664    location to store the data read from memory.  OUT_RESULT is the location to
12665    store the result of the operation.  MEM is the memory location to read and
12666    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12667    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12668    be NULL.  */
12669
12670 void
12671 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12672                          rtx mem, rtx value, rtx model_rtx)
12673 {
12674   machine_mode mode = GET_MODE (mem);
12675   machine_mode wmode = (mode == DImode ? DImode : SImode);
12676   const bool short_mode = (mode < SImode);
12677   aarch64_atomic_load_op_code ldop_code;
12678   rtx src;
12679   rtx x;
12680
12681   if (out_data)
12682     out_data = gen_lowpart (mode, out_data);
12683
12684   if (out_result)
12685     out_result = gen_lowpart (mode, out_result);
12686
12687   /* Make sure the value is in a register, putting it into a destination
12688      register if it needs to be manipulated.  */
12689   if (!register_operand (value, mode)
12690       || code == AND || code == MINUS)
12691     {
12692       src = out_result ? out_result : out_data;
12693       emit_move_insn (src, gen_lowpart (mode, value));
12694     }
12695   else
12696     src = value;
12697   gcc_assert (register_operand (src, mode));
12698
12699   /* Preprocess the data for the operation as necessary.  If the operation is
12700      a SET then emit a swap instruction and finish.  */
12701   switch (code)
12702     {
12703     case SET:
12704       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12705       return;
12706
12707     case MINUS:
12708       /* Negate the value and treat it as a PLUS.  */
12709       {
12710         rtx neg_src;
12711
12712         /* Resize the value if necessary.  */
12713         if (short_mode)
12714           src = gen_lowpart (wmode, src);
12715
12716         neg_src = gen_rtx_NEG (wmode, src);
12717         emit_insn (gen_rtx_SET (src, neg_src));
12718
12719         if (short_mode)
12720           src = gen_lowpart (mode, src);
12721       }
12722       /* Fall-through.  */
12723     case PLUS:
12724       ldop_code = AARCH64_LDOP_PLUS;
12725       break;
12726
12727     case IOR:
12728       ldop_code = AARCH64_LDOP_OR;
12729       break;
12730
12731     case XOR:
12732       ldop_code = AARCH64_LDOP_XOR;
12733       break;
12734
12735     case AND:
12736       {
12737         rtx not_src;
12738
12739         /* Resize the value if necessary.  */
12740         if (short_mode)
12741           src = gen_lowpart (wmode, src);
12742
12743         not_src = gen_rtx_NOT (wmode, src);
12744         emit_insn (gen_rtx_SET (src, not_src));
12745
12746         if (short_mode)
12747           src = gen_lowpart (mode, src);
12748       }
12749       ldop_code = AARCH64_LDOP_BIC;
12750       break;
12751
12752     default:
12753       /* The operation can't be done with atomic instructions.  */
12754       gcc_unreachable ();
12755     }
12756
12757   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12758
12759   /* If necessary, calculate the data in memory after the update by redoing the
12760      operation from values in registers.  */
12761   if (!out_result)
12762     return;
12763
12764   if (short_mode)
12765     {
12766       src = gen_lowpart (wmode, src);
12767       out_data = gen_lowpart (wmode, out_data);
12768       out_result = gen_lowpart (wmode, out_result);
12769     }
12770
12771   x = NULL_RTX;
12772
12773   switch (code)
12774     {
12775     case MINUS:
12776     case PLUS:
12777       x = gen_rtx_PLUS (wmode, out_data, src);
12778       break;
12779     case IOR:
12780       x = gen_rtx_IOR (wmode, out_data, src);
12781       break;
12782     case XOR:
12783       x = gen_rtx_XOR (wmode, out_data, src);
12784       break;
12785     case AND:
12786       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12787       return;
12788     default:
12789       gcc_unreachable ();
12790     }
12791
12792   emit_set_insn (out_result, x);
12793
12794   return;
12795 }
12796
12797 /* Split an atomic operation.  */
12798
12799 void
12800 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12801                          rtx value, rtx model_rtx, rtx cond)
12802 {
12803   machine_mode mode = GET_MODE (mem);
12804   machine_mode wmode = (mode == DImode ? DImode : SImode);
12805   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12806   const bool is_sync = is_mm_sync (model);
12807   rtx_code_label *label;
12808   rtx x;
12809
12810   /* Split the atomic operation into a sequence.  */
12811   label = gen_label_rtx ();
12812   emit_label (label);
12813
12814   if (new_out)
12815     new_out = gen_lowpart (wmode, new_out);
12816   if (old_out)
12817     old_out = gen_lowpart (wmode, old_out);
12818   else
12819     old_out = new_out;
12820   value = simplify_gen_subreg (wmode, value, mode, 0);
12821
12822   /* The initial load can be relaxed for a __sync operation since a final
12823      barrier will be emitted to stop code hoisting.  */
12824  if (is_sync)
12825     aarch64_emit_load_exclusive (mode, old_out, mem,
12826                                  GEN_INT (MEMMODEL_RELAXED));
12827   else
12828     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12829
12830   switch (code)
12831     {
12832     case SET:
12833       new_out = value;
12834       break;
12835
12836     case NOT:
12837       x = gen_rtx_AND (wmode, old_out, value);
12838       emit_insn (gen_rtx_SET (new_out, x));
12839       x = gen_rtx_NOT (wmode, new_out);
12840       emit_insn (gen_rtx_SET (new_out, x));
12841       break;
12842
12843     case MINUS:
12844       if (CONST_INT_P (value))
12845         {
12846           value = GEN_INT (-INTVAL (value));
12847           code = PLUS;
12848         }
12849       /* Fall through.  */
12850
12851     default:
12852       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12853       emit_insn (gen_rtx_SET (new_out, x));
12854       break;
12855     }
12856
12857   aarch64_emit_store_exclusive (mode, cond, mem,
12858                                 gen_lowpart (mode, new_out), model_rtx);
12859
12860   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12861   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12862                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12863   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12864
12865   /* Emit any final barrier needed for a __sync operation.  */
12866   if (is_sync)
12867     aarch64_emit_post_barrier (model);
12868 }
12869
12870 static void
12871 aarch64_init_libfuncs (void)
12872 {
12873    /* Half-precision float operations.  The compiler handles all operations
12874      with NULL libfuncs by converting to SFmode.  */
12875
12876   /* Conversions.  */
12877   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12878   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12879
12880   /* Arithmetic.  */
12881   set_optab_libfunc (add_optab, HFmode, NULL);
12882   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12883   set_optab_libfunc (smul_optab, HFmode, NULL);
12884   set_optab_libfunc (neg_optab, HFmode, NULL);
12885   set_optab_libfunc (sub_optab, HFmode, NULL);
12886
12887   /* Comparisons.  */
12888   set_optab_libfunc (eq_optab, HFmode, NULL);
12889   set_optab_libfunc (ne_optab, HFmode, NULL);
12890   set_optab_libfunc (lt_optab, HFmode, NULL);
12891   set_optab_libfunc (le_optab, HFmode, NULL);
12892   set_optab_libfunc (ge_optab, HFmode, NULL);
12893   set_optab_libfunc (gt_optab, HFmode, NULL);
12894   set_optab_libfunc (unord_optab, HFmode, NULL);
12895 }
12896
12897 /* Target hook for c_mode_for_suffix.  */
12898 static machine_mode
12899 aarch64_c_mode_for_suffix (char suffix)
12900 {
12901   if (suffix == 'q')
12902     return TFmode;
12903
12904   return VOIDmode;
12905 }
12906
12907 /* We can only represent floating point constants which will fit in
12908    "quarter-precision" values.  These values are characterised by
12909    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12910    by:
12911
12912    (-1)^s * (n/16) * 2^r
12913
12914    Where:
12915      's' is the sign bit.
12916      'n' is an integer in the range 16 <= n <= 31.
12917      'r' is an integer in the range -3 <= r <= 4.  */
12918
12919 /* Return true iff X can be represented by a quarter-precision
12920    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12921 bool
12922 aarch64_float_const_representable_p (rtx x)
12923 {
12924   /* This represents our current view of how many bits
12925      make up the mantissa.  */
12926   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12927   int exponent;
12928   unsigned HOST_WIDE_INT mantissa, mask;
12929   REAL_VALUE_TYPE r, m;
12930   bool fail;
12931
12932   if (!CONST_DOUBLE_P (x))
12933     return false;
12934
12935   /* We don't support HFmode constants yet.  */
12936   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12937     return false;
12938
12939   r = *CONST_DOUBLE_REAL_VALUE (x);
12940
12941   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12942      know if we have +zero until we analyse the mantissa, but we
12943      can reject the other invalid values.  */
12944   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12945       || REAL_VALUE_MINUS_ZERO (r))
12946     return false;
12947
12948   /* Extract exponent.  */
12949   r = real_value_abs (&r);
12950   exponent = REAL_EXP (&r);
12951
12952   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12953      highest (sign) bit, with a fixed binary point at bit point_pos.
12954      m1 holds the low part of the mantissa, m2 the high part.
12955      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12956      bits for the mantissa, this can fail (low bits will be lost).  */
12957   real_ldexp (&m, &r, point_pos - exponent);
12958   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12959
12960   /* If the low part of the mantissa has bits set we cannot represent
12961      the value.  */
12962   if (w.ulow () != 0)
12963     return false;
12964   /* We have rejected the lower HOST_WIDE_INT, so update our
12965      understanding of how many bits lie in the mantissa and
12966      look only at the high HOST_WIDE_INT.  */
12967   mantissa = w.elt (1);
12968   point_pos -= HOST_BITS_PER_WIDE_INT;
12969
12970   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12971   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12972   if ((mantissa & mask) != 0)
12973     return false;
12974
12975   /* Having filtered unrepresentable values, we may now remove all
12976      but the highest 5 bits.  */
12977   mantissa >>= point_pos - 5;
12978
12979   /* We cannot represent the value 0.0, so reject it.  This is handled
12980      elsewhere.  */
12981   if (mantissa == 0)
12982     return false;
12983
12984   /* Then, as bit 4 is always set, we can mask it off, leaving
12985      the mantissa in the range [0, 15].  */
12986   mantissa &= ~(1 << 4);
12987   gcc_assert (mantissa <= 15);
12988
12989   /* GCC internally does not use IEEE754-like encoding (where normalized
12990      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12991      Our mantissa values are shifted 4 places to the left relative to
12992      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12993      by 5 places to correct for GCC's representation.  */
12994   exponent = 5 - exponent;
12995
12996   return (exponent >= 0 && exponent <= 7);
12997 }
12998
12999 char*
13000 aarch64_output_simd_mov_immediate (rtx const_vector,
13001                                    machine_mode mode,
13002                                    unsigned width)
13003 {
13004   bool is_valid;
13005   static char templ[40];
13006   const char *mnemonic;
13007   const char *shift_op;
13008   unsigned int lane_count = 0;
13009   char element_char;
13010
13011   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13012
13013   /* This will return true to show const_vector is legal for use as either
13014      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
13015      also update INFO to show how the immediate should be generated.  */
13016   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13017   gcc_assert (is_valid);
13018
13019   element_char = sizetochar (info.element_width);
13020   lane_count = width / info.element_width;
13021
13022   mode = GET_MODE_INNER (mode);
13023   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13024     {
13025       gcc_assert (info.shift == 0 && ! info.mvn);
13026       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13027          move immediate path.  */
13028       if (aarch64_float_const_zero_rtx_p (info.value))
13029         info.value = GEN_INT (0);
13030       else
13031         {
13032           const unsigned int buf_size = 20;
13033           char float_buf[buf_size] = {'\0'};
13034           real_to_decimal_for_mode (float_buf,
13035                                     CONST_DOUBLE_REAL_VALUE (info.value),
13036                                     buf_size, buf_size, 1, mode);
13037
13038           if (lane_count == 1)
13039             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13040           else
13041             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13042                       lane_count, element_char, float_buf);
13043           return templ;
13044         }
13045     }
13046
13047   mnemonic = info.mvn ? "mvni" : "movi";
13048   shift_op = info.msl ? "msl" : "lsl";
13049
13050   gcc_assert (CONST_INT_P (info.value));
13051   if (lane_count == 1)
13052     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13053               mnemonic, UINTVAL (info.value));
13054   else if (info.shift)
13055     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13056               ", %s %d", mnemonic, lane_count, element_char,
13057               UINTVAL (info.value), shift_op, info.shift);
13058   else
13059     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13060               mnemonic, lane_count, element_char, UINTVAL (info.value));
13061   return templ;
13062 }
13063
13064 char*
13065 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13066 {
13067
13068   /* If a floating point number was passed and we desire to use it in an
13069      integer mode do the conversion to integer.  */
13070   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13071     {
13072       unsigned HOST_WIDE_INT ival;
13073       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13074           gcc_unreachable ();
13075       immediate = gen_int_mode (ival, mode);
13076     }
13077
13078   machine_mode vmode;
13079   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13080      a 128 bit vector mode.  */
13081   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13082
13083   vmode = aarch64_simd_container_mode (mode, width);
13084   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13085   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13086 }
13087
13088 /* Split operands into moves from op[1] + op[2] into op[0].  */
13089
13090 void
13091 aarch64_split_combinev16qi (rtx operands[3])
13092 {
13093   unsigned int dest = REGNO (operands[0]);
13094   unsigned int src1 = REGNO (operands[1]);
13095   unsigned int src2 = REGNO (operands[2]);
13096   machine_mode halfmode = GET_MODE (operands[1]);
13097   unsigned int halfregs = REG_NREGS (operands[1]);
13098   rtx destlo, desthi;
13099
13100   gcc_assert (halfmode == V16QImode);
13101
13102   if (src1 == dest && src2 == dest + halfregs)
13103     {
13104       /* No-op move.  Can't split to nothing; emit something.  */
13105       emit_note (NOTE_INSN_DELETED);
13106       return;
13107     }
13108
13109   /* Preserve register attributes for variable tracking.  */
13110   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13111   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13112                                GET_MODE_SIZE (halfmode));
13113
13114   /* Special case of reversed high/low parts.  */
13115   if (reg_overlap_mentioned_p (operands[2], destlo)
13116       && reg_overlap_mentioned_p (operands[1], desthi))
13117     {
13118       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13119       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13120       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13121     }
13122   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13123     {
13124       /* Try to avoid unnecessary moves if part of the result
13125          is in the right place already.  */
13126       if (src1 != dest)
13127         emit_move_insn (destlo, operands[1]);
13128       if (src2 != dest + halfregs)
13129         emit_move_insn (desthi, operands[2]);
13130     }
13131   else
13132     {
13133       if (src2 != dest + halfregs)
13134         emit_move_insn (desthi, operands[2]);
13135       if (src1 != dest)
13136         emit_move_insn (destlo, operands[1]);
13137     }
13138 }
13139
13140 /* vec_perm support.  */
13141
13142 #define MAX_VECT_LEN 16
13143
13144 struct expand_vec_perm_d
13145 {
13146   rtx target, op0, op1;
13147   unsigned char perm[MAX_VECT_LEN];
13148   machine_mode vmode;
13149   unsigned char nelt;
13150   bool one_vector_p;
13151   bool testing_p;
13152 };
13153
13154 /* Generate a variable permutation.  */
13155
13156 static void
13157 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13158 {
13159   machine_mode vmode = GET_MODE (target);
13160   bool one_vector_p = rtx_equal_p (op0, op1);
13161
13162   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13163   gcc_checking_assert (GET_MODE (op0) == vmode);
13164   gcc_checking_assert (GET_MODE (op1) == vmode);
13165   gcc_checking_assert (GET_MODE (sel) == vmode);
13166   gcc_checking_assert (TARGET_SIMD);
13167
13168   if (one_vector_p)
13169     {
13170       if (vmode == V8QImode)
13171         {
13172           /* Expand the argument to a V16QI mode by duplicating it.  */
13173           rtx pair = gen_reg_rtx (V16QImode);
13174           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13175           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13176         }
13177       else
13178         {
13179           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13180         }
13181     }
13182   else
13183     {
13184       rtx pair;
13185
13186       if (vmode == V8QImode)
13187         {
13188           pair = gen_reg_rtx (V16QImode);
13189           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13190           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13191         }
13192       else
13193         {
13194           pair = gen_reg_rtx (OImode);
13195           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13196           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13197         }
13198     }
13199 }
13200
13201 void
13202 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13203 {
13204   machine_mode vmode = GET_MODE (target);
13205   unsigned int nelt = GET_MODE_NUNITS (vmode);
13206   bool one_vector_p = rtx_equal_p (op0, op1);
13207   rtx mask;
13208
13209   /* The TBL instruction does not use a modulo index, so we must take care
13210      of that ourselves.  */
13211   mask = aarch64_simd_gen_const_vector_dup (vmode,
13212       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13213   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13214
13215   /* For big-endian, we also need to reverse the index within the vector
13216      (but not which vector).  */
13217   if (BYTES_BIG_ENDIAN)
13218     {
13219       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13220       if (!one_vector_p)
13221         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13222       sel = expand_simple_binop (vmode, XOR, sel, mask,
13223                                  NULL, 0, OPTAB_LIB_WIDEN);
13224     }
13225   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13226 }
13227
13228 /* Recognize patterns suitable for the TRN instructions.  */
13229 static bool
13230 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13231 {
13232   unsigned int i, odd, mask, nelt = d->nelt;
13233   rtx out, in0, in1, x;
13234   rtx (*gen) (rtx, rtx, rtx);
13235   machine_mode vmode = d->vmode;
13236
13237   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13238     return false;
13239
13240   /* Note that these are little-endian tests.
13241      We correct for big-endian later.  */
13242   if (d->perm[0] == 0)
13243     odd = 0;
13244   else if (d->perm[0] == 1)
13245     odd = 1;
13246   else
13247     return false;
13248   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13249
13250   for (i = 0; i < nelt; i += 2)
13251     {
13252       if (d->perm[i] != i + odd)
13253         return false;
13254       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13255         return false;
13256     }
13257
13258   /* Success!  */
13259   if (d->testing_p)
13260     return true;
13261
13262   in0 = d->op0;
13263   in1 = d->op1;
13264   if (BYTES_BIG_ENDIAN)
13265     {
13266       x = in0, in0 = in1, in1 = x;
13267       odd = !odd;
13268     }
13269   out = d->target;
13270
13271   if (odd)
13272     {
13273       switch (vmode)
13274         {
13275         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13276         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13277         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13278         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13279         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13280         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13281         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13282         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13283         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13284         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13285         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13286         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13287         default:
13288           return false;
13289         }
13290     }
13291   else
13292     {
13293       switch (vmode)
13294         {
13295         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13296         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13297         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13298         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13299         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13300         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13301         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13302         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13303         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13304         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13305         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13306         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13307         default:
13308           return false;
13309         }
13310     }
13311
13312   emit_insn (gen (out, in0, in1));
13313   return true;
13314 }
13315
13316 /* Recognize patterns suitable for the UZP instructions.  */
13317 static bool
13318 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13319 {
13320   unsigned int i, odd, mask, nelt = d->nelt;
13321   rtx out, in0, in1, x;
13322   rtx (*gen) (rtx, rtx, rtx);
13323   machine_mode vmode = d->vmode;
13324
13325   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13326     return false;
13327
13328   /* Note that these are little-endian tests.
13329      We correct for big-endian later.  */
13330   if (d->perm[0] == 0)
13331     odd = 0;
13332   else if (d->perm[0] == 1)
13333     odd = 1;
13334   else
13335     return false;
13336   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13337
13338   for (i = 0; i < nelt; i++)
13339     {
13340       unsigned elt = (i * 2 + odd) & mask;
13341       if (d->perm[i] != elt)
13342         return false;
13343     }
13344
13345   /* Success!  */
13346   if (d->testing_p)
13347     return true;
13348
13349   in0 = d->op0;
13350   in1 = d->op1;
13351   if (BYTES_BIG_ENDIAN)
13352     {
13353       x = in0, in0 = in1, in1 = x;
13354       odd = !odd;
13355     }
13356   out = d->target;
13357
13358   if (odd)
13359     {
13360       switch (vmode)
13361         {
13362         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13363         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13364         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13365         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13366         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13367         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13368         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13369         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13370         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13371         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13372         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13373         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13374         default:
13375           return false;
13376         }
13377     }
13378   else
13379     {
13380       switch (vmode)
13381         {
13382         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13383         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13384         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13385         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13386         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13387         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13388         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13389         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13390         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13391         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13392         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13393         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13394         default:
13395           return false;
13396         }
13397     }
13398
13399   emit_insn (gen (out, in0, in1));
13400   return true;
13401 }
13402
13403 /* Recognize patterns suitable for the ZIP instructions.  */
13404 static bool
13405 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13406 {
13407   unsigned int i, high, mask, nelt = d->nelt;
13408   rtx out, in0, in1, x;
13409   rtx (*gen) (rtx, rtx, rtx);
13410   machine_mode vmode = d->vmode;
13411
13412   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13413     return false;
13414
13415   /* Note that these are little-endian tests.
13416      We correct for big-endian later.  */
13417   high = nelt / 2;
13418   if (d->perm[0] == high)
13419     /* Do Nothing.  */
13420     ;
13421   else if (d->perm[0] == 0)
13422     high = 0;
13423   else
13424     return false;
13425   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13426
13427   for (i = 0; i < nelt / 2; i++)
13428     {
13429       unsigned elt = (i + high) & mask;
13430       if (d->perm[i * 2] != elt)
13431         return false;
13432       elt = (elt + nelt) & mask;
13433       if (d->perm[i * 2 + 1] != elt)
13434         return false;
13435     }
13436
13437   /* Success!  */
13438   if (d->testing_p)
13439     return true;
13440
13441   in0 = d->op0;
13442   in1 = d->op1;
13443   if (BYTES_BIG_ENDIAN)
13444     {
13445       x = in0, in0 = in1, in1 = x;
13446       high = !high;
13447     }
13448   out = d->target;
13449
13450   if (high)
13451     {
13452       switch (vmode)
13453         {
13454         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13455         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13456         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13457         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13458         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13459         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13460         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13461         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13462         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13463         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13464         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13465         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13466         default:
13467           return false;
13468         }
13469     }
13470   else
13471     {
13472       switch (vmode)
13473         {
13474         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13475         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13476         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13477         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13478         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13479         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13480         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13481         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13482         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13483         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13484         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13485         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13486         default:
13487           return false;
13488         }
13489     }
13490
13491   emit_insn (gen (out, in0, in1));
13492   return true;
13493 }
13494
13495 /* Recognize patterns for the EXT insn.  */
13496
13497 static bool
13498 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13499 {
13500   unsigned int i, nelt = d->nelt;
13501   rtx (*gen) (rtx, rtx, rtx, rtx);
13502   rtx offset;
13503
13504   unsigned int location = d->perm[0]; /* Always < nelt.  */
13505
13506   /* Check if the extracted indices are increasing by one.  */
13507   for (i = 1; i < nelt; i++)
13508     {
13509       unsigned int required = location + i;
13510       if (d->one_vector_p)
13511         {
13512           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13513           required &= (nelt - 1);
13514         }
13515       if (d->perm[i] != required)
13516         return false;
13517     }
13518
13519   switch (d->vmode)
13520     {
13521     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13522     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13523     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13524     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13525     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13526     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13527     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13528     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13529     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13530     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13531     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13532     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13533     default:
13534       return false;
13535     }
13536
13537   /* Success! */
13538   if (d->testing_p)
13539     return true;
13540
13541   /* The case where (location == 0) is a no-op for both big- and little-endian,
13542      and is removed by the mid-end at optimization levels -O1 and higher.  */
13543
13544   if (BYTES_BIG_ENDIAN && (location != 0))
13545     {
13546       /* After setup, we want the high elements of the first vector (stored
13547          at the LSB end of the register), and the low elements of the second
13548          vector (stored at the MSB end of the register). So swap.  */
13549       std::swap (d->op0, d->op1);
13550       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13551       location = nelt - location;
13552     }
13553
13554   offset = GEN_INT (location);
13555   emit_insn (gen (d->target, d->op0, d->op1, offset));
13556   return true;
13557 }
13558
13559 /* Recognize patterns for the REV insns.  */
13560
13561 static bool
13562 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13563 {
13564   unsigned int i, j, diff, nelt = d->nelt;
13565   rtx (*gen) (rtx, rtx);
13566
13567   if (!d->one_vector_p)
13568     return false;
13569
13570   diff = d->perm[0];
13571   switch (diff)
13572     {
13573     case 7:
13574       switch (d->vmode)
13575         {
13576         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13577         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13578         default:
13579           return false;
13580         }
13581       break;
13582     case 3:
13583       switch (d->vmode)
13584         {
13585         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13586         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13587         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13588         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13589         default:
13590           return false;
13591         }
13592       break;
13593     case 1:
13594       switch (d->vmode)
13595         {
13596         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13597         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13598         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13599         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13600         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13601         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13602         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13603         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13604         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13605         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13606         default:
13607           return false;
13608         }
13609       break;
13610     default:
13611       return false;
13612     }
13613
13614   for (i = 0; i < nelt ; i += diff + 1)
13615     for (j = 0; j <= diff; j += 1)
13616       {
13617         /* This is guaranteed to be true as the value of diff
13618            is 7, 3, 1 and we should have enough elements in the
13619            queue to generate this.  Getting a vector mask with a
13620            value of diff other than these values implies that
13621            something is wrong by the time we get here.  */
13622         gcc_assert (i + j < nelt);
13623         if (d->perm[i + j] != i + diff - j)
13624           return false;
13625       }
13626
13627   /* Success! */
13628   if (d->testing_p)
13629     return true;
13630
13631   emit_insn (gen (d->target, d->op0));
13632   return true;
13633 }
13634
13635 static bool
13636 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13637 {
13638   rtx (*gen) (rtx, rtx, rtx);
13639   rtx out = d->target;
13640   rtx in0;
13641   machine_mode vmode = d->vmode;
13642   unsigned int i, elt, nelt = d->nelt;
13643   rtx lane;
13644
13645   elt = d->perm[0];
13646   for (i = 1; i < nelt; i++)
13647     {
13648       if (elt != d->perm[i])
13649         return false;
13650     }
13651
13652   /* The generic preparation in aarch64_expand_vec_perm_const_1
13653      swaps the operand order and the permute indices if it finds
13654      d->perm[0] to be in the second operand.  Thus, we can always
13655      use d->op0 and need not do any extra arithmetic to get the
13656      correct lane number.  */
13657   in0 = d->op0;
13658   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13659
13660   switch (vmode)
13661     {
13662     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13663     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13664     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13665     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13666     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13667     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13668     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13669     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13670     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13671     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13672     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13673     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13674     default:
13675       return false;
13676     }
13677
13678   emit_insn (gen (out, in0, lane));
13679   return true;
13680 }
13681
13682 static bool
13683 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13684 {
13685   rtx rperm[MAX_VECT_LEN], sel;
13686   machine_mode vmode = d->vmode;
13687   unsigned int i, nelt = d->nelt;
13688
13689   if (d->testing_p)
13690     return true;
13691
13692   /* Generic code will try constant permutation twice.  Once with the
13693      original mode and again with the elements lowered to QImode.
13694      So wait and don't do the selector expansion ourselves.  */
13695   if (vmode != V8QImode && vmode != V16QImode)
13696     return false;
13697
13698   for (i = 0; i < nelt; ++i)
13699     {
13700       int nunits = GET_MODE_NUNITS (vmode);
13701
13702       /* If big-endian and two vectors we end up with a weird mixed-endian
13703          mode on NEON.  Reverse the index within each word but not the word
13704          itself.  */
13705       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13706                                            : d->perm[i]);
13707     }
13708   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13709   sel = force_reg (vmode, sel);
13710
13711   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13712   return true;
13713 }
13714
13715 static bool
13716 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13717 {
13718   /* The pattern matching functions above are written to look for a small
13719      number to begin the sequence (0, 1, N/2).  If we begin with an index
13720      from the second operand, we can swap the operands.  */
13721   if (d->perm[0] >= d->nelt)
13722     {
13723       unsigned i, nelt = d->nelt;
13724
13725       gcc_assert (nelt == (nelt & -nelt));
13726       for (i = 0; i < nelt; ++i)
13727         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13728
13729       std::swap (d->op0, d->op1);
13730     }
13731
13732   if (TARGET_SIMD)
13733     {
13734       if (aarch64_evpc_rev (d))
13735         return true;
13736       else if (aarch64_evpc_ext (d))
13737         return true;
13738       else if (aarch64_evpc_dup (d))
13739         return true;
13740       else if (aarch64_evpc_zip (d))
13741         return true;
13742       else if (aarch64_evpc_uzp (d))
13743         return true;
13744       else if (aarch64_evpc_trn (d))
13745         return true;
13746       return aarch64_evpc_tbl (d);
13747     }
13748   return false;
13749 }
13750
13751 /* Expand a vec_perm_const pattern.  */
13752
13753 bool
13754 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13755 {
13756   struct expand_vec_perm_d d;
13757   int i, nelt, which;
13758
13759   d.target = target;
13760   d.op0 = op0;
13761   d.op1 = op1;
13762
13763   d.vmode = GET_MODE (target);
13764   gcc_assert (VECTOR_MODE_P (d.vmode));
13765   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13766   d.testing_p = false;
13767
13768   for (i = which = 0; i < nelt; ++i)
13769     {
13770       rtx e = XVECEXP (sel, 0, i);
13771       int ei = INTVAL (e) & (2 * nelt - 1);
13772       which |= (ei < nelt ? 1 : 2);
13773       d.perm[i] = ei;
13774     }
13775
13776   switch (which)
13777     {
13778     default:
13779       gcc_unreachable ();
13780
13781     case 3:
13782       d.one_vector_p = false;
13783       if (!rtx_equal_p (op0, op1))
13784         break;
13785
13786       /* The elements of PERM do not suggest that only the first operand
13787          is used, but both operands are identical.  Allow easier matching
13788          of the permutation by folding the permutation into the single
13789          input vector.  */
13790       /* Fall Through.  */
13791     case 2:
13792       for (i = 0; i < nelt; ++i)
13793         d.perm[i] &= nelt - 1;
13794       d.op0 = op1;
13795       d.one_vector_p = true;
13796       break;
13797
13798     case 1:
13799       d.op1 = op0;
13800       d.one_vector_p = true;
13801       break;
13802     }
13803
13804   return aarch64_expand_vec_perm_const_1 (&d);
13805 }
13806
13807 static bool
13808 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13809                                      const unsigned char *sel)
13810 {
13811   struct expand_vec_perm_d d;
13812   unsigned int i, nelt, which;
13813   bool ret;
13814
13815   d.vmode = vmode;
13816   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13817   d.testing_p = true;
13818   memcpy (d.perm, sel, nelt);
13819
13820   /* Calculate whether all elements are in one vector.  */
13821   for (i = which = 0; i < nelt; ++i)
13822     {
13823       unsigned char e = d.perm[i];
13824       gcc_assert (e < 2 * nelt);
13825       which |= (e < nelt ? 1 : 2);
13826     }
13827
13828   /* If all elements are from the second vector, reindex as if from the
13829      first vector.  */
13830   if (which == 2)
13831     for (i = 0; i < nelt; ++i)
13832       d.perm[i] -= nelt;
13833
13834   /* Check whether the mask can be applied to a single vector.  */
13835   d.one_vector_p = (which != 3);
13836
13837   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13838   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13839   if (!d.one_vector_p)
13840     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13841
13842   start_sequence ();
13843   ret = aarch64_expand_vec_perm_const_1 (&d);
13844   end_sequence ();
13845
13846   return ret;
13847 }
13848
13849 rtx
13850 aarch64_reverse_mask (machine_mode mode)
13851 {
13852   /* We have to reverse each vector because we dont have
13853      a permuted load that can reverse-load according to ABI rules.  */
13854   rtx mask;
13855   rtvec v = rtvec_alloc (16);
13856   int i, j;
13857   int nunits = GET_MODE_NUNITS (mode);
13858   int usize = GET_MODE_UNIT_SIZE (mode);
13859
13860   gcc_assert (BYTES_BIG_ENDIAN);
13861   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13862
13863   for (i = 0; i < nunits; i++)
13864     for (j = 0; j < usize; j++)
13865       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13866   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13867   return force_reg (V16QImode, mask);
13868 }
13869
13870 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
13871    true.  However due to issues with register allocation it is preferable
13872    to avoid tieing integer scalar and FP scalar modes.  Executing integer
13873    operations in general registers is better than treating them as scalar
13874    vector operations.  This reduces latency and avoids redundant int<->FP
13875    moves.  So tie modes if they are either the same class, or vector modes
13876    with other vector modes, vector structs or any scalar mode.  */
13877
13878 static bool
13879 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13880 {
13881   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13882     return true;
13883
13884   /* We specifically want to allow elements of "structure" modes to
13885      be tieable to the structure.  This more general condition allows
13886      other rarer situations too.  */
13887   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13888     return true;
13889
13890   /* Also allow any scalar modes with vectors.  */
13891   if (aarch64_vector_mode_supported_p (mode1)
13892       || aarch64_vector_mode_supported_p (mode2))
13893     return true;
13894
13895   return false;
13896 }
13897
13898 /* Return a new RTX holding the result of moving POINTER forward by
13899    AMOUNT bytes.  */
13900
13901 static rtx
13902 aarch64_move_pointer (rtx pointer, int amount)
13903 {
13904   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13905
13906   return adjust_automodify_address (pointer, GET_MODE (pointer),
13907                                     next, amount);
13908 }
13909
13910 /* Return a new RTX holding the result of moving POINTER forward by the
13911    size of the mode it points to.  */
13912
13913 static rtx
13914 aarch64_progress_pointer (rtx pointer)
13915 {
13916   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13917
13918   return aarch64_move_pointer (pointer, amount);
13919 }
13920
13921 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13922    MODE bytes.  */
13923
13924 static void
13925 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13926                                               machine_mode mode)
13927 {
13928   rtx reg = gen_reg_rtx (mode);
13929
13930   /* "Cast" the pointers to the correct mode.  */
13931   *src = adjust_address (*src, mode, 0);
13932   *dst = adjust_address (*dst, mode, 0);
13933   /* Emit the memcpy.  */
13934   emit_move_insn (reg, *src);
13935   emit_move_insn (*dst, reg);
13936   /* Move the pointers forward.  */
13937   *src = aarch64_progress_pointer (*src);
13938   *dst = aarch64_progress_pointer (*dst);
13939 }
13940
13941 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13942    we succeed, otherwise return false.  */
13943
13944 bool
13945 aarch64_expand_movmem (rtx *operands)
13946 {
13947   unsigned int n;
13948   rtx dst = operands[0];
13949   rtx src = operands[1];
13950   rtx base;
13951   bool speed_p = !optimize_function_for_size_p (cfun);
13952
13953   /* When optimizing for size, give a better estimate of the length of a
13954      memcpy call, but use the default otherwise.  */
13955   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13956
13957   /* We can't do anything smart if the amount to copy is not constant.  */
13958   if (!CONST_INT_P (operands[2]))
13959     return false;
13960
13961   n = UINTVAL (operands[2]);
13962
13963   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13964      need to make at most two moves.  For cases above 16 bytes it will be one
13965      move for each 16 byte chunk, then at most two additional moves.  */
13966   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13967     return false;
13968
13969   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13970   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13971
13972   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13973   src = adjust_automodify_address (src, VOIDmode, base, 0);
13974
13975   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13976      1-byte chunk.  */
13977   if (n < 4)
13978     {
13979       if (n >= 2)
13980         {
13981           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13982           n -= 2;
13983         }
13984
13985       if (n == 1)
13986         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13987
13988       return true;
13989     }
13990
13991   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13992      4-byte chunk, partially overlapping with the previously copied chunk.  */
13993   if (n < 8)
13994     {
13995       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13996       n -= 4;
13997       if (n > 0)
13998         {
13999           int move = n - 4;
14000
14001           src = aarch64_move_pointer (src, move);
14002           dst = aarch64_move_pointer (dst, move);
14003           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14004         }
14005       return true;
14006     }
14007
14008   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
14009      them, then (if applicable) an 8-byte chunk.  */
14010   while (n >= 8)
14011     {
14012       if (n / 16)
14013         {
14014           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14015           n -= 16;
14016         }
14017       else
14018         {
14019           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14020           n -= 8;
14021         }
14022     }
14023
14024   /* Finish the final bytes of the copy.  We can always do this in one
14025      instruction.  We either copy the exact amount we need, or partially
14026      overlap with the previous chunk we copied and copy 8-bytes.  */
14027   if (n == 0)
14028     return true;
14029   else if (n == 1)
14030     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14031   else if (n == 2)
14032     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14033   else if (n == 4)
14034     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14035   else
14036     {
14037       if (n == 3)
14038         {
14039           src = aarch64_move_pointer (src, -1);
14040           dst = aarch64_move_pointer (dst, -1);
14041           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14042         }
14043       else
14044         {
14045           int move = n - 8;
14046
14047           src = aarch64_move_pointer (src, move);
14048           dst = aarch64_move_pointer (dst, move);
14049           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14050         }
14051     }
14052
14053   return true;
14054 }
14055
14056 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14057    SImode stores.  Handle the case when the constant has identical
14058    bottom and top halves.  This is beneficial when the two stores can be
14059    merged into an STP and we avoid synthesising potentially expensive
14060    immediates twice.  Return true if such a split is possible.  */
14061
14062 bool
14063 aarch64_split_dimode_const_store (rtx dst, rtx src)
14064 {
14065   rtx lo = gen_lowpart (SImode, src);
14066   rtx hi = gen_highpart_mode (SImode, DImode, src);
14067
14068   bool size_p = optimize_function_for_size_p (cfun);
14069
14070   if (!rtx_equal_p (lo, hi))
14071     return false;
14072
14073   unsigned int orig_cost
14074     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14075   unsigned int lo_cost
14076     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14077
14078   /* We want to transform:
14079      MOV        x1, 49370
14080      MOVK       x1, 0x140, lsl 16
14081      MOVK       x1, 0xc0da, lsl 32
14082      MOVK       x1, 0x140, lsl 48
14083      STR        x1, [x0]
14084    into:
14085      MOV        w1, 49370
14086      MOVK       w1, 0x140, lsl 16
14087      STP        w1, w1, [x0]
14088    So we want to perform this only when we save two instructions
14089    or more.  When optimizing for size, however, accept any code size
14090    savings we can.  */
14091   if (size_p && orig_cost <= lo_cost)
14092     return false;
14093
14094   if (!size_p
14095       && (orig_cost <= lo_cost + 1))
14096     return false;
14097
14098   rtx mem_lo = adjust_address (dst, SImode, 0);
14099   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14100     return false;
14101
14102   rtx tmp_reg = gen_reg_rtx (SImode);
14103   aarch64_expand_mov_immediate (tmp_reg, lo);
14104   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14105   /* Don't emit an explicit store pair as this may not be always profitable.
14106      Let the sched-fusion logic decide whether to merge them.  */
14107   emit_move_insn (mem_lo, tmp_reg);
14108   emit_move_insn (mem_hi, tmp_reg);
14109
14110   return true;
14111 }
14112
14113 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14114
14115 static unsigned HOST_WIDE_INT
14116 aarch64_asan_shadow_offset (void)
14117 {
14118   return (HOST_WIDE_INT_1 << 36);
14119 }
14120
14121 static bool
14122 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14123                                         unsigned int align,
14124                                         enum by_pieces_operation op,
14125                                         bool speed_p)
14126 {
14127   /* STORE_BY_PIECES can be used when copying a constant string, but
14128      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14129      For now we always fail this and let the move_by_pieces code copy
14130      the string from read-only memory.  */
14131   if (op == STORE_BY_PIECES)
14132     return false;
14133
14134   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14135 }
14136
14137 static rtx
14138 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14139                         int code, tree treeop0, tree treeop1)
14140 {
14141   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14142   rtx op0, op1;
14143   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14144   insn_code icode;
14145   struct expand_operand ops[4];
14146
14147   start_sequence ();
14148   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14149
14150   op_mode = GET_MODE (op0);
14151   if (op_mode == VOIDmode)
14152     op_mode = GET_MODE (op1);
14153
14154   switch (op_mode)
14155     {
14156     case E_QImode:
14157     case E_HImode:
14158     case E_SImode:
14159       cmp_mode = SImode;
14160       icode = CODE_FOR_cmpsi;
14161       break;
14162
14163     case E_DImode:
14164       cmp_mode = DImode;
14165       icode = CODE_FOR_cmpdi;
14166       break;
14167
14168     case E_SFmode:
14169       cmp_mode = SFmode;
14170       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14171       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14172       break;
14173
14174     case E_DFmode:
14175       cmp_mode = DFmode;
14176       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14177       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14178       break;
14179
14180     default:
14181       end_sequence ();
14182       return NULL_RTX;
14183     }
14184
14185   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14186   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14187   if (!op0 || !op1)
14188     {
14189       end_sequence ();
14190       return NULL_RTX;
14191     }
14192   *prep_seq = get_insns ();
14193   end_sequence ();
14194
14195   create_fixed_operand (&ops[0], op0);
14196   create_fixed_operand (&ops[1], op1);
14197
14198   start_sequence ();
14199   if (!maybe_expand_insn (icode, 2, ops))
14200     {
14201       end_sequence ();
14202       return NULL_RTX;
14203     }
14204   *gen_seq = get_insns ();
14205   end_sequence ();
14206
14207   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14208                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14209 }
14210
14211 static rtx
14212 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14213                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14214 {
14215   rtx op0, op1, target;
14216   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14217   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14218   insn_code icode;
14219   struct expand_operand ops[6];
14220   int aarch64_cond;
14221
14222   push_to_sequence (*prep_seq);
14223   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14224
14225   op_mode = GET_MODE (op0);
14226   if (op_mode == VOIDmode)
14227     op_mode = GET_MODE (op1);
14228
14229   switch (op_mode)
14230     {
14231     case E_QImode:
14232     case E_HImode:
14233     case E_SImode:
14234       cmp_mode = SImode;
14235       icode = CODE_FOR_ccmpsi;
14236       break;
14237
14238     case E_DImode:
14239       cmp_mode = DImode;
14240       icode = CODE_FOR_ccmpdi;
14241       break;
14242
14243     case E_SFmode:
14244       cmp_mode = SFmode;
14245       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14246       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14247       break;
14248
14249     case E_DFmode:
14250       cmp_mode = DFmode;
14251       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14252       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14253       break;
14254
14255     default:
14256       end_sequence ();
14257       return NULL_RTX;
14258     }
14259
14260   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14261   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14262   if (!op0 || !op1)
14263     {
14264       end_sequence ();
14265       return NULL_RTX;
14266     }
14267   *prep_seq = get_insns ();
14268   end_sequence ();
14269
14270   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14271   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14272
14273   if (bit_code != AND)
14274     {
14275       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14276                                                 GET_MODE (XEXP (prev, 0))),
14277                              VOIDmode, XEXP (prev, 0), const0_rtx);
14278       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14279     }
14280
14281   create_fixed_operand (&ops[0], XEXP (prev, 0));
14282   create_fixed_operand (&ops[1], target);
14283   create_fixed_operand (&ops[2], op0);
14284   create_fixed_operand (&ops[3], op1);
14285   create_fixed_operand (&ops[4], prev);
14286   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14287
14288   push_to_sequence (*gen_seq);
14289   if (!maybe_expand_insn (icode, 6, ops))
14290     {
14291       end_sequence ();
14292       return NULL_RTX;
14293     }
14294
14295   *gen_seq = get_insns ();
14296   end_sequence ();
14297
14298   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14299 }
14300
14301 #undef TARGET_GEN_CCMP_FIRST
14302 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14303
14304 #undef TARGET_GEN_CCMP_NEXT
14305 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14306
14307 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14308    instruction fusion of some sort.  */
14309
14310 static bool
14311 aarch64_macro_fusion_p (void)
14312 {
14313   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14314 }
14315
14316
14317 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14318    should be kept together during scheduling.  */
14319
14320 static bool
14321 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14322 {
14323   rtx set_dest;
14324   rtx prev_set = single_set (prev);
14325   rtx curr_set = single_set (curr);
14326   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14327   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14328
14329   if (!aarch64_macro_fusion_p ())
14330     return false;
14331
14332   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14333     {
14334       /* We are trying to match:
14335          prev (mov)  == (set (reg r0) (const_int imm16))
14336          curr (movk) == (set (zero_extract (reg r0)
14337                                            (const_int 16)
14338                                            (const_int 16))
14339                              (const_int imm16_1))  */
14340
14341       set_dest = SET_DEST (curr_set);
14342
14343       if (GET_CODE (set_dest) == ZERO_EXTRACT
14344           && CONST_INT_P (SET_SRC (curr_set))
14345           && CONST_INT_P (SET_SRC (prev_set))
14346           && CONST_INT_P (XEXP (set_dest, 2))
14347           && INTVAL (XEXP (set_dest, 2)) == 16
14348           && REG_P (XEXP (set_dest, 0))
14349           && REG_P (SET_DEST (prev_set))
14350           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14351         {
14352           return true;
14353         }
14354     }
14355
14356   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14357     {
14358
14359       /*  We're trying to match:
14360           prev (adrp) == (set (reg r1)
14361                               (high (symbol_ref ("SYM"))))
14362           curr (add) == (set (reg r0)
14363                              (lo_sum (reg r1)
14364                                      (symbol_ref ("SYM"))))
14365           Note that r0 need not necessarily be the same as r1, especially
14366           during pre-regalloc scheduling.  */
14367
14368       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14369           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14370         {
14371           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14372               && REG_P (XEXP (SET_SRC (curr_set), 0))
14373               && REGNO (XEXP (SET_SRC (curr_set), 0))
14374                  == REGNO (SET_DEST (prev_set))
14375               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14376                               XEXP (SET_SRC (curr_set), 1)))
14377             return true;
14378         }
14379     }
14380
14381   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14382     {
14383
14384       /* We're trying to match:
14385          prev (movk) == (set (zero_extract (reg r0)
14386                                            (const_int 16)
14387                                            (const_int 32))
14388                              (const_int imm16_1))
14389          curr (movk) == (set (zero_extract (reg r0)
14390                                            (const_int 16)
14391                                            (const_int 48))
14392                              (const_int imm16_2))  */
14393
14394       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14395           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14396           && REG_P (XEXP (SET_DEST (prev_set), 0))
14397           && REG_P (XEXP (SET_DEST (curr_set), 0))
14398           && REGNO (XEXP (SET_DEST (prev_set), 0))
14399              == REGNO (XEXP (SET_DEST (curr_set), 0))
14400           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14401           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14402           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14403           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14404           && CONST_INT_P (SET_SRC (prev_set))
14405           && CONST_INT_P (SET_SRC (curr_set)))
14406         return true;
14407
14408     }
14409   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14410     {
14411       /* We're trying to match:
14412           prev (adrp) == (set (reg r0)
14413                               (high (symbol_ref ("SYM"))))
14414           curr (ldr) == (set (reg r1)
14415                              (mem (lo_sum (reg r0)
14416                                              (symbol_ref ("SYM")))))
14417                  or
14418           curr (ldr) == (set (reg r1)
14419                              (zero_extend (mem
14420                                            (lo_sum (reg r0)
14421                                                    (symbol_ref ("SYM"))))))  */
14422       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14423           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14424         {
14425           rtx curr_src = SET_SRC (curr_set);
14426
14427           if (GET_CODE (curr_src) == ZERO_EXTEND)
14428             curr_src = XEXP (curr_src, 0);
14429
14430           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14431               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14432               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14433                  == REGNO (SET_DEST (prev_set))
14434               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14435                               XEXP (SET_SRC (prev_set), 0)))
14436               return true;
14437         }
14438     }
14439
14440   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14441        && aarch_crypto_can_dual_issue (prev, curr))
14442     return true;
14443
14444   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14445       && any_condjump_p (curr))
14446     {
14447       enum attr_type prev_type = get_attr_type (prev);
14448
14449       unsigned int condreg1, condreg2;
14450       rtx cc_reg_1;
14451       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14452       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14453
14454       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14455           && prev
14456           && modified_in_p (cc_reg_1, prev))
14457         {
14458           /* FIXME: this misses some which is considered simple arthematic
14459              instructions for ThunderX.  Simple shifts are missed here.  */
14460           if (prev_type == TYPE_ALUS_SREG
14461               || prev_type == TYPE_ALUS_IMM
14462               || prev_type == TYPE_LOGICS_REG
14463               || prev_type == TYPE_LOGICS_IMM)
14464             return true;
14465         }
14466     }
14467
14468   if (prev_set
14469       && curr_set
14470       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14471       && any_condjump_p (curr))
14472     {
14473       /* We're trying to match:
14474           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14475           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14476                                                          (const_int 0))
14477                                                  (label_ref ("SYM"))
14478                                                  (pc))  */
14479       if (SET_DEST (curr_set) == (pc_rtx)
14480           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14481           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14482           && REG_P (SET_DEST (prev_set))
14483           && REGNO (SET_DEST (prev_set))
14484              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14485         {
14486           /* Fuse ALU operations followed by conditional branch instruction.  */
14487           switch (get_attr_type (prev))
14488             {
14489             case TYPE_ALU_IMM:
14490             case TYPE_ALU_SREG:
14491             case TYPE_ADC_REG:
14492             case TYPE_ADC_IMM:
14493             case TYPE_ADCS_REG:
14494             case TYPE_ADCS_IMM:
14495             case TYPE_LOGIC_REG:
14496             case TYPE_LOGIC_IMM:
14497             case TYPE_CSEL:
14498             case TYPE_ADR:
14499             case TYPE_MOV_IMM:
14500             case TYPE_SHIFT_REG:
14501             case TYPE_SHIFT_IMM:
14502             case TYPE_BFM:
14503             case TYPE_RBIT:
14504             case TYPE_REV:
14505             case TYPE_EXTEND:
14506               return true;
14507
14508             default:;
14509             }
14510         }
14511     }
14512
14513   return false;
14514 }
14515
14516 /* Return true iff the instruction fusion described by OP is enabled.  */
14517
14518 bool
14519 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14520 {
14521   return (aarch64_tune_params.fusible_ops & op) != 0;
14522 }
14523
14524 /* If MEM is in the form of [base+offset], extract the two parts
14525    of address and set to BASE and OFFSET, otherwise return false
14526    after clearing BASE and OFFSET.  */
14527
14528 bool
14529 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14530 {
14531   rtx addr;
14532
14533   gcc_assert (MEM_P (mem));
14534
14535   addr = XEXP (mem, 0);
14536
14537   if (REG_P (addr))
14538     {
14539       *base = addr;
14540       *offset = const0_rtx;
14541       return true;
14542     }
14543
14544   if (GET_CODE (addr) == PLUS
14545       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14546     {
14547       *base = XEXP (addr, 0);
14548       *offset = XEXP (addr, 1);
14549       return true;
14550     }
14551
14552   *base = NULL_RTX;
14553   *offset = NULL_RTX;
14554
14555   return false;
14556 }
14557
14558 /* Types for scheduling fusion.  */
14559 enum sched_fusion_type
14560 {
14561   SCHED_FUSION_NONE = 0,
14562   SCHED_FUSION_LD_SIGN_EXTEND,
14563   SCHED_FUSION_LD_ZERO_EXTEND,
14564   SCHED_FUSION_LD,
14565   SCHED_FUSION_ST,
14566   SCHED_FUSION_NUM
14567 };
14568
14569 /* If INSN is a load or store of address in the form of [base+offset],
14570    extract the two parts and set to BASE and OFFSET.  Return scheduling
14571    fusion type this INSN is.  */
14572
14573 static enum sched_fusion_type
14574 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14575 {
14576   rtx x, dest, src;
14577   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14578
14579   gcc_assert (INSN_P (insn));
14580   x = PATTERN (insn);
14581   if (GET_CODE (x) != SET)
14582     return SCHED_FUSION_NONE;
14583
14584   src = SET_SRC (x);
14585   dest = SET_DEST (x);
14586
14587   machine_mode dest_mode = GET_MODE (dest);
14588
14589   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14590     return SCHED_FUSION_NONE;
14591
14592   if (GET_CODE (src) == SIGN_EXTEND)
14593     {
14594       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14595       src = XEXP (src, 0);
14596       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14597         return SCHED_FUSION_NONE;
14598     }
14599   else if (GET_CODE (src) == ZERO_EXTEND)
14600     {
14601       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14602       src = XEXP (src, 0);
14603       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14604         return SCHED_FUSION_NONE;
14605     }
14606
14607   if (GET_CODE (src) == MEM && REG_P (dest))
14608     extract_base_offset_in_addr (src, base, offset);
14609   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14610     {
14611       fusion = SCHED_FUSION_ST;
14612       extract_base_offset_in_addr (dest, base, offset);
14613     }
14614   else
14615     return SCHED_FUSION_NONE;
14616
14617   if (*base == NULL_RTX || *offset == NULL_RTX)
14618     fusion = SCHED_FUSION_NONE;
14619
14620   return fusion;
14621 }
14622
14623 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14624
14625    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14626    and PRI are only calculated for these instructions.  For other instruction,
14627    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14628    type instruction fusion can be added by returning different priorities.
14629
14630    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14631
14632 static void
14633 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14634                                int *fusion_pri, int *pri)
14635 {
14636   int tmp, off_val;
14637   rtx base, offset;
14638   enum sched_fusion_type fusion;
14639
14640   gcc_assert (INSN_P (insn));
14641
14642   tmp = max_pri - 1;
14643   fusion = fusion_load_store (insn, &base, &offset);
14644   if (fusion == SCHED_FUSION_NONE)
14645     {
14646       *pri = tmp;
14647       *fusion_pri = tmp;
14648       return;
14649     }
14650
14651   /* Set FUSION_PRI according to fusion type and base register.  */
14652   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14653
14654   /* Calculate PRI.  */
14655   tmp /= 2;
14656
14657   /* INSN with smaller offset goes first.  */
14658   off_val = (int)(INTVAL (offset));
14659   if (off_val >= 0)
14660     tmp -= (off_val & 0xfffff);
14661   else
14662     tmp += ((- off_val) & 0xfffff);
14663
14664   *pri = tmp;
14665   return;
14666 }
14667
14668 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14669    Adjust priority of sha1h instructions so they are scheduled before
14670    other SHA1 instructions.  */
14671
14672 static int
14673 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14674 {
14675   rtx x = PATTERN (insn);
14676
14677   if (GET_CODE (x) == SET)
14678     {
14679       x = SET_SRC (x);
14680
14681       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14682         return priority + 10;
14683     }
14684
14685   return priority;
14686 }
14687
14688 /* Given OPERANDS of consecutive load/store, check if we can merge
14689    them into ldp/stp.  LOAD is true if they are load instructions.
14690    MODE is the mode of memory operands.  */
14691
14692 bool
14693 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14694                                 machine_mode mode)
14695 {
14696   HOST_WIDE_INT offval_1, offval_2, msize;
14697   enum reg_class rclass_1, rclass_2;
14698   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14699
14700   if (load)
14701     {
14702       mem_1 = operands[1];
14703       mem_2 = operands[3];
14704       reg_1 = operands[0];
14705       reg_2 = operands[2];
14706       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14707       if (REGNO (reg_1) == REGNO (reg_2))
14708         return false;
14709     }
14710   else
14711     {
14712       mem_1 = operands[0];
14713       mem_2 = operands[2];
14714       reg_1 = operands[1];
14715       reg_2 = operands[3];
14716     }
14717
14718   /* The mems cannot be volatile.  */
14719   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14720     return false;
14721
14722   /* If we have SImode and slow unaligned ldp,
14723      check the alignment to be at least 8 byte. */
14724   if (mode == SImode
14725       && (aarch64_tune_params.extra_tuning_flags
14726           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14727       && !optimize_size
14728       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14729     return false;
14730
14731   /* Check if the addresses are in the form of [base+offset].  */
14732   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14733   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14734     return false;
14735   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14736   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14737     return false;
14738
14739   /* Check if the bases are same.  */
14740   if (!rtx_equal_p (base_1, base_2))
14741     return false;
14742
14743   offval_1 = INTVAL (offset_1);
14744   offval_2 = INTVAL (offset_2);
14745   msize = GET_MODE_SIZE (mode);
14746   /* Check if the offsets are consecutive.  */
14747   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14748     return false;
14749
14750   /* Check if the addresses are clobbered by load.  */
14751   if (load)
14752     {
14753       if (reg_mentioned_p (reg_1, mem_1))
14754         return false;
14755
14756       /* In increasing order, the last load can clobber the address.  */
14757       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14758       return false;
14759     }
14760
14761   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14762     rclass_1 = FP_REGS;
14763   else
14764     rclass_1 = GENERAL_REGS;
14765
14766   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14767     rclass_2 = FP_REGS;
14768   else
14769     rclass_2 = GENERAL_REGS;
14770
14771   /* Check if the registers are of same class.  */
14772   if (rclass_1 != rclass_2)
14773     return false;
14774
14775   return true;
14776 }
14777
14778 /* Given OPERANDS of consecutive load/store, check if we can merge
14779    them into ldp/stp by adjusting the offset.  LOAD is true if they
14780    are load instructions.  MODE is the mode of memory operands.
14781
14782    Given below consecutive stores:
14783
14784      str  w1, [xb, 0x100]
14785      str  w1, [xb, 0x104]
14786      str  w1, [xb, 0x108]
14787      str  w1, [xb, 0x10c]
14788
14789    Though the offsets are out of the range supported by stp, we can
14790    still pair them after adjusting the offset, like:
14791
14792      add  scratch, xb, 0x100
14793      stp  w1, w1, [scratch]
14794      stp  w1, w1, [scratch, 0x8]
14795
14796    The peephole patterns detecting this opportunity should guarantee
14797    the scratch register is avaliable.  */
14798
14799 bool
14800 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14801                                        scalar_mode mode)
14802 {
14803   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14804   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14805   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14806   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14807
14808   if (load)
14809     {
14810       reg_1 = operands[0];
14811       mem_1 = operands[1];
14812       reg_2 = operands[2];
14813       mem_2 = operands[3];
14814       reg_3 = operands[4];
14815       mem_3 = operands[5];
14816       reg_4 = operands[6];
14817       mem_4 = operands[7];
14818       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14819                   && REG_P (reg_3) && REG_P (reg_4));
14820       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14821         return false;
14822     }
14823   else
14824     {
14825       mem_1 = operands[0];
14826       reg_1 = operands[1];
14827       mem_2 = operands[2];
14828       reg_2 = operands[3];
14829       mem_3 = operands[4];
14830       reg_3 = operands[5];
14831       mem_4 = operands[6];
14832       reg_4 = operands[7];
14833     }
14834   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14835   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14836     return false;
14837
14838   /* The mems cannot be volatile.  */
14839   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14840       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14841     return false;
14842
14843   /* Check if the addresses are in the form of [base+offset].  */
14844   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14845   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14846     return false;
14847   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14848   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14849     return false;
14850   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14851   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14852     return false;
14853   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14854   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14855     return false;
14856
14857   /* Check if the bases are same.  */
14858   if (!rtx_equal_p (base_1, base_2)
14859       || !rtx_equal_p (base_2, base_3)
14860       || !rtx_equal_p (base_3, base_4))
14861     return false;
14862
14863   offval_1 = INTVAL (offset_1);
14864   offval_2 = INTVAL (offset_2);
14865   offval_3 = INTVAL (offset_3);
14866   offval_4 = INTVAL (offset_4);
14867   msize = GET_MODE_SIZE (mode);
14868   /* Check if the offsets are consecutive.  */
14869   if ((offval_1 != (offval_2 + msize)
14870        || offval_1 != (offval_3 + msize * 2)
14871        || offval_1 != (offval_4 + msize * 3))
14872       && (offval_4 != (offval_3 + msize)
14873           || offval_4 != (offval_2 + msize * 2)
14874           || offval_4 != (offval_1 + msize * 3)))
14875     return false;
14876
14877   /* Check if the addresses are clobbered by load.  */
14878   if (load)
14879     {
14880       if (reg_mentioned_p (reg_1, mem_1)
14881           || reg_mentioned_p (reg_2, mem_2)
14882           || reg_mentioned_p (reg_3, mem_3))
14883         return false;
14884
14885       /* In increasing order, the last load can clobber the address.  */
14886       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14887         return false;
14888     }
14889
14890   /* If we have SImode and slow unaligned ldp,
14891      check the alignment to be at least 8 byte. */
14892   if (mode == SImode
14893       && (aarch64_tune_params.extra_tuning_flags
14894           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14895       && !optimize_size
14896       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14897     return false;
14898
14899   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14900     rclass_1 = FP_REGS;
14901   else
14902     rclass_1 = GENERAL_REGS;
14903
14904   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14905     rclass_2 = FP_REGS;
14906   else
14907     rclass_2 = GENERAL_REGS;
14908
14909   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14910     rclass_3 = FP_REGS;
14911   else
14912     rclass_3 = GENERAL_REGS;
14913
14914   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14915     rclass_4 = FP_REGS;
14916   else
14917     rclass_4 = GENERAL_REGS;
14918
14919   /* Check if the registers are of same class.  */
14920   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14921     return false;
14922
14923   return true;
14924 }
14925
14926 /* Given OPERANDS of consecutive load/store, this function pairs them
14927    into ldp/stp after adjusting the offset.  It depends on the fact
14928    that addresses of load/store instructions are in increasing order.
14929    MODE is the mode of memory operands.  CODE is the rtl operator
14930    which should be applied to all memory operands, it's SIGN_EXTEND,
14931    ZERO_EXTEND or UNKNOWN.  */
14932
14933 bool
14934 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14935                              scalar_mode mode, RTX_CODE code)
14936 {
14937   rtx base, offset, t1, t2;
14938   rtx mem_1, mem_2, mem_3, mem_4;
14939   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14940
14941   if (load)
14942     {
14943       mem_1 = operands[1];
14944       mem_2 = operands[3];
14945       mem_3 = operands[5];
14946       mem_4 = operands[7];
14947     }
14948   else
14949     {
14950       mem_1 = operands[0];
14951       mem_2 = operands[2];
14952       mem_3 = operands[4];
14953       mem_4 = operands[6];
14954       gcc_assert (code == UNKNOWN);
14955     }
14956
14957   extract_base_offset_in_addr (mem_1, &base, &offset);
14958   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14959
14960   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14961   msize = GET_MODE_SIZE (mode);
14962   stp_off_limit = msize * 0x40;
14963   off_val = INTVAL (offset);
14964   abs_off = (off_val < 0) ? -off_val : off_val;
14965   new_off = abs_off % stp_off_limit;
14966   adj_off = abs_off - new_off;
14967
14968   /* Further adjust to make sure all offsets are OK.  */
14969   if ((new_off + msize * 2) >= stp_off_limit)
14970     {
14971       adj_off += stp_off_limit;
14972       new_off -= stp_off_limit;
14973     }
14974
14975   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14976   if (adj_off >= 0x1000)
14977     return false;
14978
14979   if (off_val < 0)
14980     {
14981       adj_off = -adj_off;
14982       new_off = -new_off;
14983     }
14984
14985   /* Create new memory references.  */
14986   mem_1 = change_address (mem_1, VOIDmode,
14987                           plus_constant (DImode, operands[8], new_off));
14988
14989   /* Check if the adjusted address is OK for ldp/stp.  */
14990   if (!aarch64_mem_pair_operand (mem_1, mode))
14991     return false;
14992
14993   msize = GET_MODE_SIZE (mode);
14994   mem_2 = change_address (mem_2, VOIDmode,
14995                           plus_constant (DImode,
14996                                          operands[8],
14997                                          new_off + msize));
14998   mem_3 = change_address (mem_3, VOIDmode,
14999                           plus_constant (DImode,
15000                                          operands[8],
15001                                          new_off + msize * 2));
15002   mem_4 = change_address (mem_4, VOIDmode,
15003                           plus_constant (DImode,
15004                                          operands[8],
15005                                          new_off + msize * 3));
15006
15007   if (code == ZERO_EXTEND)
15008     {
15009       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15010       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15011       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15012       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15013     }
15014   else if (code == SIGN_EXTEND)
15015     {
15016       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15017       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15018       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15019       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15020     }
15021
15022   if (load)
15023     {
15024       operands[1] = mem_1;
15025       operands[3] = mem_2;
15026       operands[5] = mem_3;
15027       operands[7] = mem_4;
15028     }
15029   else
15030     {
15031       operands[0] = mem_1;
15032       operands[2] = mem_2;
15033       operands[4] = mem_3;
15034       operands[6] = mem_4;
15035     }
15036
15037   /* Emit adjusting instruction.  */
15038   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15039   /* Emit ldp/stp instructions.  */
15040   t1 = gen_rtx_SET (operands[0], operands[1]);
15041   t2 = gen_rtx_SET (operands[2], operands[3]);
15042   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15043   t1 = gen_rtx_SET (operands[4], operands[5]);
15044   t2 = gen_rtx_SET (operands[6], operands[7]);
15045   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15046   return true;
15047 }
15048
15049 /* Return 1 if pseudo register should be created and used to hold
15050    GOT address for PIC code.  */
15051
15052 bool
15053 aarch64_use_pseudo_pic_reg (void)
15054 {
15055   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15056 }
15057
15058 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15059
15060 static int
15061 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15062 {
15063   switch (XINT (x, 1))
15064     {
15065     case UNSPEC_GOTSMALLPIC:
15066     case UNSPEC_GOTSMALLPIC28K:
15067     case UNSPEC_GOTTINYPIC:
15068       return 0;
15069     default:
15070       break;
15071     }
15072
15073   return default_unspec_may_trap_p (x, flags);
15074 }
15075
15076
15077 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15078    return the log2 of that value.  Otherwise return -1.  */
15079
15080 int
15081 aarch64_fpconst_pow_of_2 (rtx x)
15082 {
15083   const REAL_VALUE_TYPE *r;
15084
15085   if (!CONST_DOUBLE_P (x))
15086     return -1;
15087
15088   r = CONST_DOUBLE_REAL_VALUE (x);
15089
15090   if (REAL_VALUE_NEGATIVE (*r)
15091       || REAL_VALUE_ISNAN (*r)
15092       || REAL_VALUE_ISINF (*r)
15093       || !real_isinteger (r, DFmode))
15094     return -1;
15095
15096   return exact_log2 (real_to_integer (r));
15097 }
15098
15099 /* If X is a vector of equal CONST_DOUBLE values and that value is
15100    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15101
15102 int
15103 aarch64_vec_fpconst_pow_of_2 (rtx x)
15104 {
15105   if (GET_CODE (x) != CONST_VECTOR)
15106     return -1;
15107
15108   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15109     return -1;
15110
15111   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15112   if (firstval <= 0)
15113     return -1;
15114
15115   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15116     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15117       return -1;
15118
15119   return firstval;
15120 }
15121
15122 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15123    to float.
15124
15125    __fp16 always promotes through this hook.
15126    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15127    through the generic excess precision logic rather than here.  */
15128
15129 static tree
15130 aarch64_promoted_type (const_tree t)
15131 {
15132   if (SCALAR_FLOAT_TYPE_P (t)
15133       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15134     return float_type_node;
15135
15136   return NULL_TREE;
15137 }
15138
15139 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15140
15141 static bool
15142 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15143                            optimization_type opt_type)
15144 {
15145   switch (op)
15146     {
15147     case rsqrt_optab:
15148       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15149
15150     default:
15151       return true;
15152     }
15153 }
15154
15155 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15156    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15157
15158 static bool
15159 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15160 {
15161   return (mode == HFmode
15162           ? true
15163           : default_libgcc_floating_mode_supported_p (mode));
15164 }
15165
15166 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15167    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15168
15169 static bool
15170 aarch64_scalar_mode_supported_p (scalar_mode mode)
15171 {
15172   return (mode == HFmode
15173           ? true
15174           : default_scalar_mode_supported_p (mode));
15175 }
15176
15177 /* Set the value of FLT_EVAL_METHOD.
15178    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15179
15180     0: evaluate all operations and constants, whose semantic type has at
15181        most the range and precision of type float, to the range and
15182        precision of float; evaluate all other operations and constants to
15183        the range and precision of the semantic type;
15184
15185     N, where _FloatN is a supported interchange floating type
15186        evaluate all operations and constants, whose semantic type has at
15187        most the range and precision of _FloatN type, to the range and
15188        precision of the _FloatN type; evaluate all other operations and
15189        constants to the range and precision of the semantic type;
15190
15191    If we have the ARMv8.2-A extensions then we support _Float16 in native
15192    precision, so we should set this to 16.  Otherwise, we support the type,
15193    but want to evaluate expressions in float precision, so set this to
15194    0.  */
15195
15196 static enum flt_eval_method
15197 aarch64_excess_precision (enum excess_precision_type type)
15198 {
15199   switch (type)
15200     {
15201       case EXCESS_PRECISION_TYPE_FAST:
15202       case EXCESS_PRECISION_TYPE_STANDARD:
15203         /* We can calculate either in 16-bit range and precision or
15204            32-bit range and precision.  Make that decision based on whether
15205            we have native support for the ARMv8.2-A 16-bit floating-point
15206            instructions or not.  */
15207         return (TARGET_FP_F16INST
15208                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15209                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15210       case EXCESS_PRECISION_TYPE_IMPLICIT:
15211         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15212       default:
15213         gcc_unreachable ();
15214     }
15215   return FLT_EVAL_METHOD_UNPREDICTABLE;
15216 }
15217
15218 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15219    scheduled for speculative execution.  Reject the long-running division
15220    and square-root instructions.  */
15221
15222 static bool
15223 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15224 {
15225   switch (get_attr_type (insn))
15226     {
15227       case TYPE_SDIV:
15228       case TYPE_UDIV:
15229       case TYPE_FDIVS:
15230       case TYPE_FDIVD:
15231       case TYPE_FSQRTS:
15232       case TYPE_FSQRTD:
15233       case TYPE_NEON_FP_SQRT_S:
15234       case TYPE_NEON_FP_SQRT_D:
15235       case TYPE_NEON_FP_SQRT_S_Q:
15236       case TYPE_NEON_FP_SQRT_D_Q:
15237       case TYPE_NEON_FP_DIV_S:
15238       case TYPE_NEON_FP_DIV_D:
15239       case TYPE_NEON_FP_DIV_S_Q:
15240       case TYPE_NEON_FP_DIV_D_Q:
15241         return false;
15242       default:
15243         return true;
15244     }
15245 }
15246
15247 /* Target-specific selftests.  */
15248
15249 #if CHECKING_P
15250
15251 namespace selftest {
15252
15253 /* Selftest for the RTL loader.
15254    Verify that the RTL loader copes with a dump from
15255    print_rtx_function.  This is essentially just a test that class
15256    function_reader can handle a real dump, but it also verifies
15257    that lookup_reg_by_dump_name correctly handles hard regs.
15258    The presence of hard reg names in the dump means that the test is
15259    target-specific, hence it is in this file.  */
15260
15261 static void
15262 aarch64_test_loading_full_dump ()
15263 {
15264   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15265
15266   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15267
15268   rtx_insn *insn_1 = get_insn_by_uid (1);
15269   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15270
15271   rtx_insn *insn_15 = get_insn_by_uid (15);
15272   ASSERT_EQ (INSN, GET_CODE (insn_15));
15273   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15274
15275   /* Verify crtl->return_rtx.  */
15276   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15277   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15278   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15279 }
15280
15281 /* Run all target-specific selftests.  */
15282
15283 static void
15284 aarch64_run_selftests (void)
15285 {
15286   aarch64_test_loading_full_dump ();
15287 }
15288
15289 } // namespace selftest
15290
15291 #endif /* #if CHECKING_P */
15292
15293 #undef TARGET_ADDRESS_COST
15294 #define TARGET_ADDRESS_COST aarch64_address_cost
15295
15296 /* This hook will determines whether unnamed bitfields affect the alignment
15297    of the containing structure.  The hook returns true if the structure
15298    should inherit the alignment requirements of an unnamed bitfield's
15299    type.  */
15300 #undef TARGET_ALIGN_ANON_BITFIELD
15301 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15302
15303 #undef TARGET_ASM_ALIGNED_DI_OP
15304 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15305
15306 #undef TARGET_ASM_ALIGNED_HI_OP
15307 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15308
15309 #undef TARGET_ASM_ALIGNED_SI_OP
15310 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15311
15312 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15313 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15314   hook_bool_const_tree_hwi_hwi_const_tree_true
15315
15316 #undef TARGET_ASM_FILE_START
15317 #define TARGET_ASM_FILE_START aarch64_start_file
15318
15319 #undef TARGET_ASM_OUTPUT_MI_THUNK
15320 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15321
15322 #undef TARGET_ASM_SELECT_RTX_SECTION
15323 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15324
15325 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15326 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15327
15328 #undef TARGET_BUILD_BUILTIN_VA_LIST
15329 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15330
15331 #undef TARGET_CALLEE_COPIES
15332 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15333
15334 #undef TARGET_CAN_ELIMINATE
15335 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15336
15337 #undef TARGET_CAN_INLINE_P
15338 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15339
15340 #undef TARGET_CANNOT_FORCE_CONST_MEM
15341 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15342
15343 #undef TARGET_CASE_VALUES_THRESHOLD
15344 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15345
15346 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15347 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15348
15349 /* Only the least significant bit is used for initialization guard
15350    variables.  */
15351 #undef TARGET_CXX_GUARD_MASK_BIT
15352 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15353
15354 #undef TARGET_C_MODE_FOR_SUFFIX
15355 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15356
15357 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15358 #undef  TARGET_DEFAULT_TARGET_FLAGS
15359 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15360 #endif
15361
15362 #undef TARGET_CLASS_MAX_NREGS
15363 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15364
15365 #undef TARGET_BUILTIN_DECL
15366 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15367
15368 #undef TARGET_BUILTIN_RECIPROCAL
15369 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15370
15371 #undef TARGET_C_EXCESS_PRECISION
15372 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15373
15374 #undef  TARGET_EXPAND_BUILTIN
15375 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15376
15377 #undef TARGET_EXPAND_BUILTIN_VA_START
15378 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15379
15380 #undef TARGET_FOLD_BUILTIN
15381 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15382
15383 #undef TARGET_FUNCTION_ARG
15384 #define TARGET_FUNCTION_ARG aarch64_function_arg
15385
15386 #undef TARGET_FUNCTION_ARG_ADVANCE
15387 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15388
15389 #undef TARGET_FUNCTION_ARG_BOUNDARY
15390 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15391
15392 #undef TARGET_FUNCTION_ARG_PADDING
15393 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15394
15395 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15396 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15397
15398 #undef TARGET_FUNCTION_VALUE
15399 #define TARGET_FUNCTION_VALUE aarch64_function_value
15400
15401 #undef TARGET_FUNCTION_VALUE_REGNO_P
15402 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15403
15404 #undef TARGET_FRAME_POINTER_REQUIRED
15405 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15406
15407 #undef TARGET_GIMPLE_FOLD_BUILTIN
15408 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15409
15410 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15411 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15412
15413 #undef  TARGET_INIT_BUILTINS
15414 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15415
15416 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15417 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15418   aarch64_ira_change_pseudo_allocno_class
15419
15420 #undef TARGET_LEGITIMATE_ADDRESS_P
15421 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15422
15423 #undef TARGET_LEGITIMATE_CONSTANT_P
15424 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15425
15426 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15427 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15428   aarch64_legitimize_address_displacement
15429
15430 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15431 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15432
15433 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15434 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15435 aarch64_libgcc_floating_mode_supported_p
15436
15437 #undef TARGET_MANGLE_TYPE
15438 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15439
15440 #undef TARGET_MEMORY_MOVE_COST
15441 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15442
15443 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15444 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15445
15446 #undef TARGET_MUST_PASS_IN_STACK
15447 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15448
15449 /* This target hook should return true if accesses to volatile bitfields
15450    should use the narrowest mode possible.  It should return false if these
15451    accesses should use the bitfield container type.  */
15452 #undef TARGET_NARROW_VOLATILE_BITFIELD
15453 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15454
15455 #undef  TARGET_OPTION_OVERRIDE
15456 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15457
15458 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15459 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15460   aarch64_override_options_after_change
15461
15462 #undef TARGET_OPTION_SAVE
15463 #define TARGET_OPTION_SAVE aarch64_option_save
15464
15465 #undef TARGET_OPTION_RESTORE
15466 #define TARGET_OPTION_RESTORE aarch64_option_restore
15467
15468 #undef TARGET_OPTION_PRINT
15469 #define TARGET_OPTION_PRINT aarch64_option_print
15470
15471 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15472 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15473
15474 #undef TARGET_SET_CURRENT_FUNCTION
15475 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15476
15477 #undef TARGET_PASS_BY_REFERENCE
15478 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15479
15480 #undef TARGET_PREFERRED_RELOAD_CLASS
15481 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15482
15483 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15484 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15485
15486 #undef TARGET_PROMOTED_TYPE
15487 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15488
15489 #undef TARGET_SECONDARY_RELOAD
15490 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15491
15492 #undef TARGET_SHIFT_TRUNCATION_MASK
15493 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15494
15495 #undef TARGET_SETUP_INCOMING_VARARGS
15496 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15497
15498 #undef TARGET_STRUCT_VALUE_RTX
15499 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15500
15501 #undef TARGET_REGISTER_MOVE_COST
15502 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15503
15504 #undef TARGET_RETURN_IN_MEMORY
15505 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15506
15507 #undef TARGET_RETURN_IN_MSB
15508 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15509
15510 #undef TARGET_RTX_COSTS
15511 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15512
15513 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15514 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15515
15516 #undef TARGET_SCHED_ISSUE_RATE
15517 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15518
15519 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15520 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15521   aarch64_sched_first_cycle_multipass_dfa_lookahead
15522
15523 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15524 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15525   aarch64_first_cycle_multipass_dfa_lookahead_guard
15526
15527 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15528 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15529   aarch64_get_separate_components
15530
15531 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15532 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15533   aarch64_components_for_bb
15534
15535 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15536 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15537   aarch64_disqualify_components
15538
15539 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15540 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15541   aarch64_emit_prologue_components
15542
15543 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15544 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15545   aarch64_emit_epilogue_components
15546
15547 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15548 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15549   aarch64_set_handled_components
15550
15551 #undef TARGET_TRAMPOLINE_INIT
15552 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15553
15554 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15555 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15556
15557 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15558 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15559
15560 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15561 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15562   aarch64_builtin_support_vector_misalignment
15563
15564 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15565 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15566
15567 #undef TARGET_VECTORIZE_ADD_STMT_COST
15568 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15569
15570 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15571 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15572   aarch64_builtin_vectorization_cost
15573
15574 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15575 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15576
15577 #undef TARGET_VECTORIZE_BUILTINS
15578 #define TARGET_VECTORIZE_BUILTINS
15579
15580 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15581 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15582   aarch64_builtin_vectorized_function
15583
15584 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15585 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15586   aarch64_autovectorize_vector_sizes
15587
15588 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15589 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15590   aarch64_atomic_assign_expand_fenv
15591
15592 /* Section anchor support.  */
15593
15594 #undef TARGET_MIN_ANCHOR_OFFSET
15595 #define TARGET_MIN_ANCHOR_OFFSET -256
15596
15597 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15598    byte offset; we can do much more for larger data types, but have no way
15599    to determine the size of the access.  We assume accesses are aligned.  */
15600 #undef TARGET_MAX_ANCHOR_OFFSET
15601 #define TARGET_MAX_ANCHOR_OFFSET 4095
15602
15603 #undef TARGET_VECTOR_ALIGNMENT
15604 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15605
15606 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15607 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15608   aarch64_simd_vector_alignment_reachable
15609
15610 /* vec_perm support.  */
15611
15612 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15613 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15614   aarch64_vectorize_vec_perm_const_ok
15615
15616 #undef TARGET_INIT_LIBFUNCS
15617 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15618
15619 #undef TARGET_FIXED_CONDITION_CODE_REGS
15620 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15621
15622 #undef TARGET_FLAGS_REGNUM
15623 #define TARGET_FLAGS_REGNUM CC_REGNUM
15624
15625 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15626 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15627
15628 #undef TARGET_ASAN_SHADOW_OFFSET
15629 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15630
15631 #undef TARGET_LEGITIMIZE_ADDRESS
15632 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15633
15634 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15635 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15636   aarch64_use_by_pieces_infrastructure_p
15637
15638 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15639 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15640
15641 #undef TARGET_CAN_USE_DOLOOP_P
15642 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15643
15644 #undef TARGET_SCHED_ADJUST_PRIORITY
15645 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15646
15647 #undef TARGET_SCHED_MACRO_FUSION_P
15648 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15649
15650 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15651 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15652
15653 #undef TARGET_SCHED_FUSION_PRIORITY
15654 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15655
15656 #undef TARGET_UNSPEC_MAY_TRAP_P
15657 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15658
15659 #undef TARGET_USE_PSEUDO_PIC_REG
15660 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15661
15662 #undef TARGET_PRINT_OPERAND
15663 #define TARGET_PRINT_OPERAND aarch64_print_operand
15664
15665 #undef TARGET_PRINT_OPERAND_ADDRESS
15666 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15667
15668 #undef TARGET_OPTAB_SUPPORTED_P
15669 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15670
15671 #undef TARGET_OMIT_STRUCT_RETURN_REG
15672 #define TARGET_OMIT_STRUCT_RETURN_REG true
15673
15674 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15675 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15676 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15677
15678 #undef TARGET_HARD_REGNO_NREGS
15679 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15680 #undef TARGET_HARD_REGNO_MODE_OK
15681 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15682
15683 #undef TARGET_MODES_TIEABLE_P
15684 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15685
15686 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15687 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15688   aarch64_hard_regno_call_part_clobbered
15689
15690 #if CHECKING_P
15691 #undef TARGET_RUN_TARGET_SELFTESTS
15692 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15693 #endif /* #if CHECKING_P */
15694
15695 struct gcc_target targetm = TARGET_INITIALIZER;
15696
15697 #include "gt-aarch64.h"