Turn HARD_REGNO_CALL_PART_CLOBBERED into a target hook
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob: ba48b28d1d54af52a0921e9f180af863b11e24ca
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "attribs.h"
37 #include "optabs.h"
38 #include "regs.h"
39 #include "emit-rtl.h"
40 #include "recog.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "flags.h"
50 #include "explow.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "langhooks.h"
54 #include "opts.h"
55 #include "params.h"
56 #include "gimplify.h"
57 #include "dwarf2.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
61 #include "dumpfile.h"
62 #include "builtins.h"
63 #include "rtl-iter.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
68 #include "selftest.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
74 /* Defined for convenience. */
75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
79 ADDRESS_REG_IMM
80 A simple base register plus immediate offset.
82 ADDRESS_REG_WB
83 A base register indexed by immediate offset with writeback.
85 ADDRESS_REG_REG
86 A base register indexed by (optionally scaled) register.
88 ADDRESS_REG_UXTW
89 A base register indexed by (optionally scaled) zero-extended register.
91 ADDRESS_REG_SXTW
92 A base register indexed by (optionally scaled) sign-extended register.
94 ADDRESS_LO_SUM
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
 97    ADDRESS_SYMBOLIC
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type {
101 ADDRESS_REG_IMM,
102 ADDRESS_REG_WB,
103 ADDRESS_REG_REG,
104 ADDRESS_REG_UXTW,
105 ADDRESS_REG_SXTW,
106 ADDRESS_LO_SUM,
107 ADDRESS_SYMBOLIC
110 struct aarch64_address_info {
111 enum aarch64_address_type type;
112 rtx base;
113 rtx offset;
114 int shift;
115 enum aarch64_symbol_type symbol_type;
118 struct simd_immediate_info
120 rtx value;
121 int shift;
122 int element_width;
123 bool mvn;
124 bool msl;
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel;
130 #ifdef HAVE_AS_TLS
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
133 #endif
135 static bool aarch64_composite_type_p (const_tree, machine_mode);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
137 const_tree,
138 machine_mode *, int *,
139 bool *);
140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
145 const unsigned char *sel);
146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
148 const_tree type,
149 int misalignment,
150 bool is_packed);
151 static machine_mode
152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags = 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE },
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL },
191 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table =
199 1, /* hi */
200 0, /* si */
201 0, /* di */
202 1, /* ti */
204 0, /* pre_modify */
205 0, /* post_modify */
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
209 0 /* imm_offset */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
215 0, /* hi */
216 0, /* si */
217 0, /* di */
218 2, /* ti */
220 0, /* pre_modify */
221 0, /* post_modify */
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
241 0, /* imm_offset */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
247 1, /* hi */
248 1, /* si */
249 1, /* di */
250 2, /* ti */
252 0, /* pre_modify */
253 0, /* post_modify */
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
257 0, /* imm_offset */
260 static const struct cpu_regmove_cost generic_regmove_cost =
262 1, /* GP2GP */
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
265 5, /* GP2FP */
266 5, /* FP2GP */
267 2 /* FP2FP */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
272 1, /* GP2GP */
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
275 5, /* GP2FP */
276 5, /* FP2GP */
277 2 /* FP2FP */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
282 1, /* GP2GP */
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
285 5, /* GP2FP */
286 5, /* FP2GP */
287 2 /* FP2FP */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
 294    their cost higher than memmov_cost (the actual costs are 4 and 9). */
295 9, /* GP2FP */
296 9, /* FP2GP */
297 1 /* FP2FP */
300 static const struct cpu_regmove_cost thunderx_regmove_cost =
302 2, /* GP2GP */
303 2, /* GP2FP */
304 6, /* FP2GP */
305 4 /* FP2FP */
308 static const struct cpu_regmove_cost xgene1_regmove_cost =
310 1, /* GP2GP */
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
313 8, /* GP2FP */
314 8, /* FP2GP */
315 2 /* FP2FP */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
320 2, /* GP2GP */
321 /* Avoid the use of int<->fp moves for spilling. */
322 6, /* GP2FP */
323 6, /* FP2GP */
324 4 /* FP2FP */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of int<->fp moves for spilling. */
331 8, /* GP2FP */
332 8, /* FP2GP */
333 4 /* FP2FP */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost =
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost =
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
 376 /* Costs for vector insn classes for Cortex-A57. */
377 static const struct cpu_vector_cost cortexa57_vector_cost =
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost =
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
 415 /* Costs for vector insn classes for X-Gene 1. */
416 static const struct cpu_vector_cost xgene1_vector_cost =
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
 435 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost =
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes =
465 AARCH64_APPROX_NONE, /* division */
466 AARCH64_APPROX_NONE, /* sqrt */
467 AARCH64_APPROX_NONE /* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes =
473 AARCH64_APPROX_NONE, /* division */
474 AARCH64_APPROX_ALL, /* sqrt */
475 AARCH64_APPROX_ALL /* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes =
481 AARCH64_APPROX_NONE, /* division */
482 AARCH64_APPROX_NONE, /* sqrt */
483 AARCH64_APPROX_ALL /* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune =
489 0, /* num_slots */
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
498 0, /* num_slots */
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
507 4, /* num_slots */
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
516 8, /* num_slots */
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune =
525 8, /* num_slots */
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
534 8, /* num_slots */
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings =
543 &cortexa57_extra_costs,
544 &generic_addrcost_table,
545 &generic_regmove_cost,
546 &generic_vector_cost,
547 &generic_branch_cost,
548 &generic_approx_modes,
549 4, /* memmov_cost */
550 2, /* issue_rate */
551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
552 8, /* function_align. */
553 4, /* jump_align. */
554 8, /* loop_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings =
568 &cortexa53_extra_costs,
569 &generic_addrcost_table,
570 &cortexa53_regmove_cost,
571 &generic_vector_cost,
572 &generic_branch_cost,
573 &generic_approx_modes,
574 4, /* memmov_cost */
575 1, /* issue_rate */
576 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
578 16, /* function_align. */
579 4, /* jump_align. */
580 8, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings =
594 &cortexa53_extra_costs,
595 &generic_addrcost_table,
596 &cortexa53_regmove_cost,
597 &generic_vector_cost,
598 &generic_branch_cost,
599 &generic_approx_modes,
600 4, /* memmov_cost */
601 2, /* issue_rate */
602 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
604 16, /* function_align. */
605 4, /* jump_align. */
606 8, /* loop_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings =
620 &cortexa57_extra_costs,
621 &generic_addrcost_table,
622 &cortexa57_regmove_cost,
623 &cortexa57_vector_cost,
624 &generic_branch_cost,
625 &generic_approx_modes,
626 4, /* memmov_cost */
627 3, /* issue_rate */
628 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
630 16, /* function_align. */
631 4, /* jump_align. */
632 8, /* loop_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings =
646 &cortexa57_extra_costs,
647 &generic_addrcost_table,
648 &cortexa57_regmove_cost,
649 &cortexa57_vector_cost,
650 &generic_branch_cost,
651 &generic_approx_modes,
652 4, /* memmov_cost */
653 3, /* issue_rate */
654 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
656 16, /* function_align. */
657 4, /* jump_align. */
658 8, /* loop_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings =
672 &cortexa57_extra_costs,
673 &generic_addrcost_table,
674 &cortexa57_regmove_cost,
675 &cortexa57_vector_cost,
676 &generic_branch_cost,
677 &generic_approx_modes,
678 4, /* memmov_cost. */
679 2, /* issue_rate. */
680 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
682 16, /* function_align. */
683 4, /* jump_align. */
684 8, /* loop_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings =
700 &exynosm1_extra_costs,
701 &exynosm1_addrcost_table,
702 &exynosm1_regmove_cost,
703 &exynosm1_vector_cost,
704 &generic_branch_cost,
705 &exynosm1_approx_modes,
706 4, /* memmov_cost */
707 3, /* issue_rate */
708 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
709 4, /* function_align. */
710 4, /* jump_align. */
711 4, /* loop_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings =
725 &thunderx_extra_costs,
726 &generic_addrcost_table,
727 &thunderx_regmove_cost,
728 &thunderx_vector_cost,
729 &generic_branch_cost,
730 &generic_approx_modes,
731 6, /* memmov_cost */
732 2, /* issue_rate */
733 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
734 8, /* function_align. */
735 8, /* jump_align. */
736 8, /* loop_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings =
750 &thunderx_extra_costs,
751 &generic_addrcost_table,
752 &thunderx_regmove_cost,
753 &thunderx_vector_cost,
754 &generic_branch_cost,
755 &generic_approx_modes,
756 6, /* memmov_cost */
757 2, /* issue_rate */
758 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
759 8, /* function_align. */
760 8, /* jump_align. */
761 8, /* loop_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings =
776 &xgene1_extra_costs,
777 &xgene1_addrcost_table,
778 &xgene1_regmove_cost,
779 &xgene1_vector_cost,
780 &generic_branch_cost,
781 &xgene1_approx_modes,
782 6, /* memmov_cost */
783 4, /* issue_rate */
784 AARCH64_FUSE_NOTHING, /* fusible_ops */
785 16, /* function_align. */
786 8, /* jump_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings =
801 &qdf24xx_extra_costs,
802 &generic_addrcost_table,
803 &qdf24xx_regmove_cost,
804 &generic_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 4, /* memmov_cost */
808 4, /* issue_rate */
809 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 16, /* function_align. */
812 8, /* jump_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings =
827 &thunderx2t99_extra_costs,
828 &thunderx2t99_addrcost_table,
829 &thunderx2t99_regmove_cost,
830 &thunderx2t99_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 4, /* memmov_cost. */
834 4, /* issue_rate. */
835 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
837 16, /* function_align. */
838 8, /* jump_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
854 const char* name;
855 void (*parse_override)(const char*, struct tune_params*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions[] =
864 { "fuse", aarch64_parse_fuse_string },
865 { "tune", aarch64_parse_tune_string },
866 { NULL, NULL }
869 /* A processor implementing AArch64. */
870 struct processor
872 const char *const name;
873 enum aarch64_processor ident;
874 enum aarch64_processor sched_core;
875 enum aarch64_arch arch;
876 unsigned architecture_version;
877 const unsigned long flags;
878 const struct tune_params *const tune;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
899 AARCH64_FL_FOR_ARCH8, &generic_tunings},
900 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor *selected_arch;
907 static const struct processor *selected_cpu;
908 static const struct processor *selected_tune;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params = generic_tunings;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
915 /* An ISA extension in the co-processor and main instruction set space. */
916 struct aarch64_option_extension
918 const char *const name;
919 const unsigned long flags_on;
920 const unsigned long flags_off;
923 typedef enum aarch64_cond_code
925 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
926 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
927 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
929 aarch64_cc;
931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
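/* The XOR works because the enumeration above lists the codes in inverse
   pairs (EQ/NE, CS/CC, MI/PL, ...), so flipping the low bit maps each code
   to its inverse; e.g. AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) yields
   AARCH64_LT.  */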
933 /* The condition codes of the processor, and the inverse function. */
934 static const char * const aarch64_condition_codes[] =
936 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
937 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 const char *
942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
943 const char * branch_format)
945 rtx_code_label * tmp_label = gen_label_rtx ();
946 char label_buf[256];
947 char buffer[128];
948 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
949 CODE_LABEL_NUMBER (tmp_label));
950 const char *label_ptr = targetm.strip_name_encoding (label_buf);
951 rtx dest_label = operands[pos_label];
952 operands[pos_label] = tmp_label;
954 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
955 output_asm_insn (buffer, operands);
957 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
958 operands[pos_label] = dest_label;
959 output_asm_insn (buffer, operands);
960 return "";
963 void
964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
966 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
969 else
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
988 static reg_class_t
989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
990 reg_class_t best_class)
992 machine_mode mode;
994 if (allocno_class != ALL_REGS)
995 return allocno_class;
997 if (best_class != ALL_REGS)
998 return best_class;
1000 mode = PSEUDO_REGNO_MODE (regno);
1001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1007 if (GET_MODE_UNIT_SIZE (mode) == 4)
1008 return aarch64_tune_params.min_div_recip_mul_sf;
1009 return aarch64_tune_params.min_div_recip_mul_df;
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014 machine_mode mode)
1016 if (VECTOR_MODE_P (mode))
1017 return aarch64_tune_params.vec_reassoc_width;
1018 if (INTEGRAL_MODE_P (mode))
1019 return aarch64_tune_params.int_reassoc_width;
1020 if (FLOAT_MODE_P (mode))
1021 return aarch64_tune_params.fp_reassoc_width;
1022 return 1;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1029 if (GP_REGNUM_P (regno))
1030 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031 else if (regno == SP_REGNUM)
1032 return AARCH64_DWARF_SP;
1033 else if (FP_REGNUM_P (regno))
1034 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS;
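/* A quick illustration, assuming the standard AArch64 DWARF numbering used
   by AARCH64_DWARF_R0 (0), AARCH64_DWARF_SP (31) and AARCH64_DWARF_V0 (64):
   x5 maps to 5, sp to 31 and v3 to 67; anything else (for example the
   condition flags) gets the "no DWARF register" value
   DWARF_FRAME_REGISTERS.  */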
1041 /* Return TRUE if MODE is any of the large INT modes. */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1045 return mode == OImode || mode == CImode || mode == XImode;
1048 /* Return TRUE if MODE is any of the vector modes. */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1052 return aarch64_vector_mode_supported_p (mode)
1053 || aarch64_vect_struct_mode_p (mode);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059 unsigned HOST_WIDE_INT nelems)
1061 if (TARGET_SIMD
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064 && (nelems >= 2 && nelems <= 4))
1065 return true;
1067 return false;
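/* Roughly speaking, this is what gives arrays of 2-4 Advanced SIMD vectors
   the large integer modes (OImode, CImode, XImode) used for the ld2/st2,
   ld3/st3 and ld4/st4 style operations; see aarch64_vect_struct_mode_p
   above.  */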
1070 /* Implement HARD_REGNO_NREGS. */
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1075 switch (aarch64_regno_regclass (regno))
1077 case FP_REGS:
1078 case FP_LO_REGS:
1079 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080 default:
1081 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1083 gcc_unreachable ();
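/* For example, with UNITS_PER_VREG == 16 and UNITS_PER_WORD == 8 (the
   AArch64 values), a 16-byte TImode quantity needs a single FP/SIMD
   register but two general-purpose registers.  */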
1086 /* Implement HARD_REGNO_MODE_OK. */
1088 int
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1091 if (GET_MODE_CLASS (mode) == MODE_CC)
1092 return regno == CC_REGNUM;
1094 if (regno == SP_REGNUM)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode == Pmode || mode == ptr_mode;
1100 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101 return mode == Pmode;
1103 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104 return 1;
1106 if (FP_REGNUM_P (regno))
1108 if (aarch64_vect_struct_mode_p (mode))
1109 return
1110 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111 else
1112 return 1;
1115 return 0;
1118 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1119 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1120 clobbers the top 64 bits when restoring the bottom 64 bits. */
1122 static bool
1123 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1125 return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
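/* This matches AAPCS64, where a callee preserves only the low 64 bits of
   V8-V15: any value wider than 8 bytes that lives in an FP/SIMD register
   must therefore be assumed partially clobbered across a call.  A minimal
   sketch of how the new hook is wired up, assuming the usual target-macro
   idiom used for the other hooks in this file:

     #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
     #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
       aarch64_hard_regno_call_part_clobbered  */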
1128 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1129 machine_mode
1130 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1131 machine_mode mode)
1133 /* Handle modes that fit within single registers. */
1134 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1136 if (GET_MODE_SIZE (mode) >= 4)
1137 return mode;
1138 else
1139 return SImode;
1141 /* Fall back to generic for multi-reg and very large modes. */
1142 else
1143 return choose_hard_reg_mode (regno, nregs, false);
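/* In other words: QImode and HImode values are widened to SImode when saved
   around a call, anything from 4 to 16 bytes that fits in one register
   keeps its own mode, and multi-register values fall back to the generic
   choice.  */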
1146 /* Return true if calls to DECL should be treated as
1147    long-calls (i.e. called via a register). */
1148 static bool
1149 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1151 return false;
1154 /* Return true if calls to symbol-ref SYM should be treated as
1155    long-calls (i.e. called via a register). */
1156 bool
1157 aarch64_is_long_call_p (rtx sym)
1159 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1162 /* Return true if calls to symbol-ref SYM should not go through
1163 plt stubs. */
1165 bool
1166 aarch64_is_noplt_call_p (rtx sym)
1168 const_tree decl = SYMBOL_REF_DECL (sym);
1170 if (flag_pic
1171 && decl
1172 && (!flag_plt
1173 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1174 && !targetm.binds_local_p (decl))
1175 return true;
1177 return false;
1180 /* Return true if the offsets to a zero/sign-extract operation
1181 represent an expression that matches an extend operation. The
1182    operands represent the parameters from
1184 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1185 bool
1186 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1187 rtx extract_imm)
1189 HOST_WIDE_INT mult_val, extract_val;
1191 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1192 return false;
1194 mult_val = INTVAL (mult_imm);
1195 extract_val = INTVAL (extract_imm);
1197 if (extract_val > 8
1198 && extract_val < GET_MODE_BITSIZE (mode)
1199 && exact_log2 (extract_val & ~7) > 0
1200 && (extract_val & 7) <= 4
1201 && mult_val == (1 << (extract_val & 7)))
1202 return true;
1204 return false;
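/* A worked example: MULT_IMM == 4 and EXTRACT_IMM == 34 pass all of the
   checks above, because extracting the low 34 bits of (reg * 4) is the
   same as zero-extending the low 32 bits of reg and shifting the result
   left by 2, i.e. exactly the extend-plus-shift form described in the
   comment above.  */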
1207 /* Emit an insn that's a simple single-set. Both the operands must be
1208 known to be valid. */
1209 inline static rtx_insn *
1210 emit_set_insn (rtx x, rtx y)
1212 return emit_insn (gen_rtx_SET (x, y));
1215 /* X and Y are two things to compare using CODE. Emit the compare insn and
1216 return the rtx for register 0 in the proper mode. */
1217 rtx
1218 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1220 machine_mode mode = SELECT_CC_MODE (code, x, y);
1221 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1223 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1224 return cc_reg;
1227 /* Build the SYMBOL_REF for __tls_get_addr. */
1229 static GTY(()) rtx tls_get_addr_libfunc;
1231 rtx
1232 aarch64_tls_get_addr (void)
1234 if (!tls_get_addr_libfunc)
1235 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1236 return tls_get_addr_libfunc;
1239 /* Return the TLS model to use for ADDR. */
1241 static enum tls_model
1242 tls_symbolic_operand_type (rtx addr)
1244 enum tls_model tls_kind = TLS_MODEL_NONE;
1245 rtx sym, addend;
1247 if (GET_CODE (addr) == CONST)
1249 split_const (addr, &sym, &addend);
1250 if (GET_CODE (sym) == SYMBOL_REF)
1251 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1253 else if (GET_CODE (addr) == SYMBOL_REF)
1254 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1256 return tls_kind;
1259 /* We allow LO_SUM rtxes in our legitimate addresses, so that combine
1260    can take care of combining addresses where necessary, but for
1261    generation purposes we generate the address
1262    as:
1263    RTL                                Absolute
1264    tmp = hi (symbol_ref);             adrp  x1, foo
1265    dest = lo_sum (tmp, symbol_ref);   add   dest, x1, :lo_12:foo
1268    PIC                                TLS
1269    adrp x1, :got:foo                  adrp  tmp, :tlsgd:foo
1270    ldr  x1, [:got_lo12:foo]           add   dest, tmp, :tlsgd_lo12:foo
1271                                       bl    __tls_get_addr
1274 Load TLS symbol, depending on TLS mechanism and TLS access model.
1276 Global Dynamic - Traditional TLS:
1277 adrp tmp, :tlsgd:imm
1278 add dest, tmp, #:tlsgd_lo12:imm
1279 bl __tls_get_addr
1281 Global Dynamic - TLS Descriptors:
1282 adrp dest, :tlsdesc:imm
1283 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1284 add dest, dest, #:tlsdesc_lo12:imm
1285 blr tmp
1286 mrs tp, tpidr_el0
1287 add dest, dest, tp
1289 Initial Exec:
1290 mrs tp, tpidr_el0
1291 adrp tmp, :gottprel:imm
1292 ldr dest, [tmp, #:gottprel_lo12:imm]
1293 add dest, dest, tp
1295 Local Exec:
1296 mrs tp, tpidr_el0
1297 add t0, tp, #:tprel_hi12:imm, lsl #12
1298 add t0, t0, #:tprel_lo12_nc:imm
1301 static void
1302 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1303 enum aarch64_symbol_type type)
1305 switch (type)
1307 case SYMBOL_SMALL_ABSOLUTE:
1309 /* In ILP32, the mode of dest can be either SImode or DImode. */
1310 rtx tmp_reg = dest;
1311 machine_mode mode = GET_MODE (dest);
1313 gcc_assert (mode == Pmode || mode == ptr_mode);
1315 if (can_create_pseudo_p ())
1316 tmp_reg = gen_reg_rtx (mode);
1318 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1319 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1320 return;
1323 case SYMBOL_TINY_ABSOLUTE:
1324 emit_insn (gen_rtx_SET (dest, imm));
1325 return;
1327 case SYMBOL_SMALL_GOT_28K:
1329 machine_mode mode = GET_MODE (dest);
1330 rtx gp_rtx = pic_offset_table_rtx;
1331 rtx insn;
1332 rtx mem;
1334 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1335    here before RTL expansion.  The tree IVOPTS pass will generate RTL
1336    patterns to decide rtx costs, in which case pic_offset_table_rtx is
1337    not initialized.  In that case there is no need to generate the first
1338    adrp instruction, as the final cost for global variable access is
1339    one instruction. */
1340 if (gp_rtx != NULL)
1342 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1343    use the page base as the GOT base, the first page may be wasted;
1344    in the worst scenario there is only 28K of space for the GOT).
1346    The generated instruction sequence for accessing a global variable is:
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1351    Only one instruction is needed.  But we must initialize
1352    pic_offset_table_rtx properly.  We generate an initialization insn for
1353    every global access, and allow CSE to remove all redundant ones.
1355    The final instruction sequence will look like the following
1356    for multiple global variable accesses.
1358 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1360 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1361 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1362 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1363 ... */
1365 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1366 crtl->uses_pic_offset_table = 1;
1367 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1369 if (mode != GET_MODE (gp_rtx))
1370 gp_rtx = gen_lowpart (mode, gp_rtx);
1374 if (mode == ptr_mode)
1376 if (mode == DImode)
1377 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1378 else
1379 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1381 mem = XVECEXP (SET_SRC (insn), 0, 0);
1383 else
1385 gcc_assert (mode == Pmode);
1387 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1388 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1391 /* The operand is expected to be a MEM.  Whenever the related insn
1392    pattern changes, the code above which calculates MEM should be
1393    updated. */
1394 gcc_assert (GET_CODE (mem) == MEM);
1395 MEM_READONLY_P (mem) = 1;
1396 MEM_NOTRAP_P (mem) = 1;
1397 emit_insn (insn);
1398 return;
1401 case SYMBOL_SMALL_GOT_4G:
1403 /* In ILP32, the mode of dest can be either SImode or DImode,
1404 while the got entry is always of SImode size. The mode of
1405 dest depends on how dest is used: if dest is assigned to a
1406 pointer (e.g. in the memory), it has SImode; it may have
1407    DImode if dest is dereferenced to access the memory.
1408 This is why we have to handle three different ldr_got_small
1409 patterns here (two patterns for ILP32). */
1411 rtx insn;
1412 rtx mem;
1413 rtx tmp_reg = dest;
1414 machine_mode mode = GET_MODE (dest);
1416 if (can_create_pseudo_p ())
1417 tmp_reg = gen_reg_rtx (mode);
1419 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1420 if (mode == ptr_mode)
1422 if (mode == DImode)
1423 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1424 else
1425 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1427 mem = XVECEXP (SET_SRC (insn), 0, 0);
1429 else
1431 gcc_assert (mode == Pmode);
1433 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1434 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1437 gcc_assert (GET_CODE (mem) == MEM);
1438 MEM_READONLY_P (mem) = 1;
1439 MEM_NOTRAP_P (mem) = 1;
1440 emit_insn (insn);
1441 return;
1444 case SYMBOL_SMALL_TLSGD:
1446 rtx_insn *insns;
1447 machine_mode mode = GET_MODE (dest);
1448 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1450 start_sequence ();
1451 if (TARGET_ILP32)
1452 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1453 else
1454 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1455 insns = get_insns ();
1456 end_sequence ();
1458 RTL_CONST_CALL_P (insns) = 1;
1459 emit_libcall_block (insns, dest, result, imm);
1460 return;
1463 case SYMBOL_SMALL_TLSDESC:
1465 machine_mode mode = GET_MODE (dest);
1466 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1467 rtx tp;
1469 gcc_assert (mode == Pmode || mode == ptr_mode);
1471 /* In ILP32, the got entry is always of SImode size. Unlike
1472 small GOT, the dest is fixed at reg 0. */
1473 if (TARGET_ILP32)
1474 emit_insn (gen_tlsdesc_small_si (imm));
1475 else
1476 emit_insn (gen_tlsdesc_small_di (imm));
1477 tp = aarch64_load_tp (NULL);
1479 if (mode != Pmode)
1480 tp = gen_lowpart (mode, tp);
1482 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1483 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1484 return;
1487 case SYMBOL_SMALL_TLSIE:
1489 /* In ILP32, the mode of dest can be either SImode or DImode,
1490 while the got entry is always of SImode size. The mode of
1491 dest depends on how dest is used: if dest is assigned to a
1492 pointer (e.g. in the memory), it has SImode; it may have
1493    DImode if dest is dereferenced to access the memory.
1494 This is why we have to handle three different tlsie_small
1495 patterns here (two patterns for ILP32). */
1496 machine_mode mode = GET_MODE (dest);
1497 rtx tmp_reg = gen_reg_rtx (mode);
1498 rtx tp = aarch64_load_tp (NULL);
1500 if (mode == ptr_mode)
1502 if (mode == DImode)
1503 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1504 else
1506 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1507 tp = gen_lowpart (mode, tp);
1510 else
1512 gcc_assert (mode == Pmode);
1513 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1516 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 case SYMBOL_TLSLE12:
1522 case SYMBOL_TLSLE24:
1523 case SYMBOL_TLSLE32:
1524 case SYMBOL_TLSLE48:
1526 machine_mode mode = GET_MODE (dest);
1527 rtx tp = aarch64_load_tp (NULL);
1529 if (mode != Pmode)
1530 tp = gen_lowpart (mode, tp);
1532 switch (type)
1534 case SYMBOL_TLSLE12:
1535 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1536 (dest, tp, imm));
1537 break;
1538 case SYMBOL_TLSLE24:
1539 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1540 (dest, tp, imm));
1541 break;
1542 case SYMBOL_TLSLE32:
1543 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1544 (dest, imm));
1545 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1546 (dest, dest, tp));
1547 break;
1548 case SYMBOL_TLSLE48:
1549 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1550 (dest, imm));
1551 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1552 (dest, dest, tp));
1553 break;
1554 default:
1555 gcc_unreachable ();
1558 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559 return;
1562 case SYMBOL_TINY_GOT:
1563 emit_insn (gen_ldr_got_tiny (dest, imm));
1564 return;
1566 case SYMBOL_TINY_TLSIE:
1568 machine_mode mode = GET_MODE (dest);
1569 rtx tp = aarch64_load_tp (NULL);
1571 if (mode == ptr_mode)
1573 if (mode == DImode)
1574 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1575 else
1577 tp = gen_lowpart (mode, tp);
1578 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1581 else
1583 gcc_assert (mode == Pmode);
1584 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1587 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1588 return;
1591 default:
1592 gcc_unreachable ();
1596 /* Emit a move from SRC to DEST. Assume that the move expanders can
1597 handle all moves if !can_create_pseudo_p (). The distinction is
1598 important because, unlike emit_move_insn, the move expanders know
1599 how to force Pmode objects into the constant pool even when the
1600 constant pool address is not itself legitimate. */
1601 static rtx
1602 aarch64_emit_move (rtx dest, rtx src)
1604 return (can_create_pseudo_p ()
1605 ? emit_move_insn (dest, src)
1606 : emit_move_insn_1 (dest, src));
1609 /* Split a 128-bit move operation into two 64-bit move operations,
1610 taking care to handle partial overlap of register to register
1611 copies. Special cases are needed when moving between GP regs and
1612 FP regs. SRC can be a register, constant or memory; DST a register
1613 or memory. If either operand is memory it must not have any side
1614 effects. */
1615 void
1616 aarch64_split_128bit_move (rtx dst, rtx src)
1618 rtx dst_lo, dst_hi;
1619 rtx src_lo, src_hi;
1621 machine_mode mode = GET_MODE (dst);
1623 gcc_assert (mode == TImode || mode == TFmode);
1624 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1625 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1627 if (REG_P (dst) && REG_P (src))
1629 int src_regno = REGNO (src);
1630 int dst_regno = REGNO (dst);
1632 /* Handle FP <-> GP regs. */
1633 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1635 src_lo = gen_lowpart (word_mode, src);
1636 src_hi = gen_highpart (word_mode, src);
1638 if (mode == TImode)
1640 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1641 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1643 else
1645 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1646 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1648 return;
1650 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1652 dst_lo = gen_lowpart (word_mode, dst);
1653 dst_hi = gen_highpart (word_mode, dst);
1655 if (mode == TImode)
1657 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1658 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1660 else
1662 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1663 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1665 return;
1669 dst_lo = gen_lowpart (word_mode, dst);
1670 dst_hi = gen_highpart (word_mode, dst);
1671 src_lo = gen_lowpart (word_mode, src);
1672 src_hi = gen_highpart_mode (word_mode, mode, src);
1674 /* At most one pairing may overlap. */
1675 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1677 aarch64_emit_move (dst_hi, src_hi);
1678 aarch64_emit_move (dst_lo, src_lo);
1680 else
1682 aarch64_emit_move (dst_lo, src_lo);
1683 aarch64_emit_move (dst_hi, src_hi);
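/* Example of the overlap case handled above: splitting a TImode copy from
   {x0,x1} to {x1,x2} must move the high halves first, since dst_lo (x1)
   is the same register as src_hi (x1).  */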
1687 bool
1688 aarch64_split_128bit_move_p (rtx dst, rtx src)
1690 return (! REG_P (src)
1691 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1694 /* Split a complex SIMD combine. */
1696 void
1697 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1699 machine_mode src_mode = GET_MODE (src1);
1700 machine_mode dst_mode = GET_MODE (dst);
1702 gcc_assert (VECTOR_MODE_P (dst_mode));
1703 gcc_assert (register_operand (dst, dst_mode)
1704 && register_operand (src1, src_mode)
1705 && register_operand (src2, src_mode));
1707 rtx (*gen) (rtx, rtx, rtx);
1709 switch (src_mode)
1711 case E_V8QImode:
1712 gen = gen_aarch64_simd_combinev8qi;
1713 break;
1714 case E_V4HImode:
1715 gen = gen_aarch64_simd_combinev4hi;
1716 break;
1717 case E_V2SImode:
1718 gen = gen_aarch64_simd_combinev2si;
1719 break;
1720 case E_V4HFmode:
1721 gen = gen_aarch64_simd_combinev4hf;
1722 break;
1723 case E_V2SFmode:
1724 gen = gen_aarch64_simd_combinev2sf;
1725 break;
1726 case E_DImode:
1727 gen = gen_aarch64_simd_combinedi;
1728 break;
1729 case E_DFmode:
1730 gen = gen_aarch64_simd_combinedf;
1731 break;
1732 default:
1733 gcc_unreachable ();
1736 emit_insn (gen (dst, src1, src2));
1737 return;
1740 /* Split a complex SIMD move. */
1742 void
1743 aarch64_split_simd_move (rtx dst, rtx src)
1745 machine_mode src_mode = GET_MODE (src);
1746 machine_mode dst_mode = GET_MODE (dst);
1748 gcc_assert (VECTOR_MODE_P (dst_mode));
1750 if (REG_P (dst) && REG_P (src))
1752 rtx (*gen) (rtx, rtx);
1754 gcc_assert (VECTOR_MODE_P (src_mode));
1756 switch (src_mode)
1758 case E_V16QImode:
1759 gen = gen_aarch64_split_simd_movv16qi;
1760 break;
1761 case E_V8HImode:
1762 gen = gen_aarch64_split_simd_movv8hi;
1763 break;
1764 case E_V4SImode:
1765 gen = gen_aarch64_split_simd_movv4si;
1766 break;
1767 case E_V2DImode:
1768 gen = gen_aarch64_split_simd_movv2di;
1769 break;
1770 case E_V8HFmode:
1771 gen = gen_aarch64_split_simd_movv8hf;
1772 break;
1773 case E_V4SFmode:
1774 gen = gen_aarch64_split_simd_movv4sf;
1775 break;
1776 case E_V2DFmode:
1777 gen = gen_aarch64_split_simd_movv2df;
1778 break;
1779 default:
1780 gcc_unreachable ();
1783 emit_insn (gen (dst, src));
1784 return;
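/* Return true if X (of mode XMODE) equals the result of zero-extending
   constant Y from YMODE to XMODE.  */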
1788 bool
1789 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1790 machine_mode ymode, rtx y)
1792 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1793 gcc_assert (r != NULL);
1794 return rtx_equal_p (x, r);
1798 static rtx
1799 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1801 if (can_create_pseudo_p ())
1802 return force_reg (mode, value);
1803 else
1805 x = aarch64_emit_move (x, value);
1806 return x;
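/* Return an rtx for REG + OFFSET in MODE.  If OFFSET is not a valid
   plus-immediate, first load it into TEMP (via aarch64_force_temporary)
   and add that instead.  */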
1811 static rtx
1812 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1814 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1816 rtx high;
1817 /* Load the full offset into a register. This
1818 might be improvable in the future. */
1819 high = GEN_INT (offset);
1820 offset = 0;
1821 high = aarch64_force_temporary (mode, temp, high);
1822 reg = aarch64_force_temporary (mode, temp,
1823 gen_rtx_PLUS (mode, high, reg));
1825 return plus_constant (mode, reg, offset);
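/* Work out how many instructions are needed to move immediate IMM of mode
   MODE into DEST, emitting them when GENERATE is true; the return value is
   the instruction count.  For instance, 0x12345678 in DImode costs two
   instructions: a MOV of 0x5678 followed by a MOVK of 0x1234 shifted left
   by 16 (the "(val >> 32) == 0" path below).  */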
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830 machine_mode mode)
1832 int i;
1833 unsigned HOST_WIDE_INT val, val2, mask;
1834 int one_match, zero_match;
1835 int num_insns;
1837 val = INTVAL (imm);
1839 if (aarch64_move_imm (val, mode))
1841 if (generate)
1842 emit_insn (gen_rtx_SET (dest, imm));
1843 return 1;
1846 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847 (with XXXX non-zero). In that case check to see if the move can be done in
1848 a smaller mode. */
1849 val2 = val & 0xffffffff;
1850 if (mode == DImode
1851 && aarch64_move_imm (val2, SImode)
1852 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1854 if (generate)
1855 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1857 /* Check if we have to emit a second instruction by checking to see
1858 if any of the upper 32 bits of the original DI mode value is set. */
1859 if (val == val2)
1860 return 1;
1862 i = (val >> 48) ? 48 : 32;
1864 if (generate)
1865 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866 GEN_INT ((val >> i) & 0xffff)));
1868 return 2;
1871 if ((val >> 32) == 0 || mode == SImode)
1873 if (generate)
1875 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876 if (mode == SImode)
1877 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878 GEN_INT ((val >> 16) & 0xffff)));
1879 else
1880 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881 GEN_INT ((val >> 16) & 0xffff)));
1883 return 2;
1886 /* Remaining cases are all for DImode. */
1888 mask = 0xffff;
1889 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1894 if (zero_match != 2 && one_match != 2)
1896 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897 For a 64-bit bitmask try whether changing 16 bits to all ones or
1898 zeroes creates a valid bitmask. To check any repeated bitmask,
1899 try using 16 bits from the other 32-bit half of val. */
1901 for (i = 0; i < 64; i += 16, mask <<= 16)
1903 val2 = val & ~mask;
1904 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905 break;
1906 val2 = val | mask;
1907 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908 break;
1909 val2 = val2 & ~mask;
1910 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912 break;
1914 if (i != 64)
1916 if (generate)
1918 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920 GEN_INT ((val >> i) & 0xffff)));
1922 return 2;
1926 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1928 otherwise skip zero bits. */
1930 num_insns = 1;
1931 mask = 0xffff;
1932 val2 = one_match > zero_match ? ~val : val;
1933 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1935 if (generate)
1936 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937 ? (val | ~(mask << i))
1938 : (val & (mask << i)))));
1939 for (i += 16; i < 64; i += 16)
1941 if ((val2 & (mask << i)) == 0)
1942 continue;
1943 if (generate)
1944 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945 GEN_INT ((val >> i) & 0xffff)));
1946 num_insns ++;
1949 return num_insns;
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1956 machine_mode mode = GET_MODE (dest);
1958 gcc_assert (mode == SImode || mode == DImode);
1960 /* Check on what type of symbol it is. */
1961 if (GET_CODE (imm) == SYMBOL_REF
1962 || GET_CODE (imm) == LABEL_REF
1963 || GET_CODE (imm) == CONST)
1965 rtx mem, base, offset;
1966 enum aarch64_symbol_type sty;
1968 /* If we have (const (plus symbol offset)), separate out the offset
1969 before we start classifying the symbol. */
1970 split_const (imm, &base, &offset);
1972 sty = aarch64_classify_symbol (base, offset);
1973 switch (sty)
1975 case SYMBOL_FORCE_TO_MEM:
1976 if (offset != const0_rtx
1977 && targetm.cannot_force_const_mem (mode, imm))
1979 gcc_assert (can_create_pseudo_p ());
1980 base = aarch64_force_temporary (mode, dest, base);
1981 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1982 aarch64_emit_move (dest, base);
1983 return;
1986 mem = force_const_mem (ptr_mode, imm);
1987 gcc_assert (mem);
1989 /* If we aren't generating PC relative literals, then
1990 we need to expand the literal pool access carefully.
1991 This is something that needs to be done in a number
1992 of places, so could well live as a separate function. */
1993 if (!aarch64_pcrelative_literal_loads)
1995 gcc_assert (can_create_pseudo_p ());
1996 base = gen_reg_rtx (ptr_mode);
1997 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1998 if (ptr_mode != Pmode)
1999 base = convert_memory_address (Pmode, base);
2000 mem = gen_rtx_MEM (ptr_mode, base);
2003 if (mode != ptr_mode)
2004 mem = gen_rtx_ZERO_EXTEND (mode, mem);
2006 emit_insn (gen_rtx_SET (dest, mem));
2008 return;
2010 case SYMBOL_SMALL_TLSGD:
2011 case SYMBOL_SMALL_TLSDESC:
2012 case SYMBOL_SMALL_TLSIE:
2013 case SYMBOL_SMALL_GOT_28K:
2014 case SYMBOL_SMALL_GOT_4G:
2015 case SYMBOL_TINY_GOT:
2016 case SYMBOL_TINY_TLSIE:
2017 if (offset != const0_rtx)
2019 gcc_assert(can_create_pseudo_p ());
2020 base = aarch64_force_temporary (mode, dest, base);
2021 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2022 aarch64_emit_move (dest, base);
2023 return;
2025 /* FALLTHRU */
2027 case SYMBOL_SMALL_ABSOLUTE:
2028 case SYMBOL_TINY_ABSOLUTE:
2029 case SYMBOL_TLSLE12:
2030 case SYMBOL_TLSLE24:
2031 case SYMBOL_TLSLE32:
2032 case SYMBOL_TLSLE48:
2033 aarch64_load_symref_appropriately (dest, imm, sty);
2034 return;
2036 default:
2037 gcc_unreachable ();
2041 if (!CONST_INT_P (imm))
2043 if (GET_CODE (imm) == HIGH)
2044 emit_insn (gen_rtx_SET (dest, imm));
2045 else
2047 rtx mem = force_const_mem (mode, imm);
2048 gcc_assert (mem);
2049 emit_insn (gen_rtx_SET (dest, mem));
2052 return;
2055 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2058 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2059 temporary value if necessary. FRAME_RELATED_P should be true if
2060 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2061 to the generated instructions. If SCRATCHREG is known to hold
2062 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2063 immediate again.
2065 Since this function may be used to adjust the stack pointer, we must
2066 ensure that it cannot cause transient stack deallocation (for example
2067 by first incrementing SP and then decrementing when adjusting by a
2068 large immediate). */
2070 static void
2071 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2072 HOST_WIDE_INT delta, bool frame_related_p,
2073 bool emit_move_imm)
2075 HOST_WIDE_INT mdelta = abs_hwi (delta);
2076 rtx this_rtx = gen_rtx_REG (mode, regnum);
2077 rtx_insn *insn;
2079 if (!mdelta)
2080 return;
2082 /* Single instruction adjustment. */
2083 if (aarch64_uimm12_shift (mdelta))
2085 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2086 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2087 return;
2090 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2091 Only do this if MDELTA cannot be handled by a single move immediate,
2092 since adjusting with a move is better in that case. */
2093 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2095 HOST_WIDE_INT low_off = mdelta & 0xfff;
2097 low_off = delta < 0 ? -low_off : low_off;
2098 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2099 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2100 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2101 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2102 return;
2105 /* Emit a move immediate if required and an addition/subtraction. */
2106 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2107 if (emit_move_imm)
2108 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2109 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2110 : gen_add2_insn (this_rtx, scratch_rtx));
2111 if (frame_related_p)
2113 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2114 rtx adj = plus_constant (mode, this_rtx, delta);
2115 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
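/* A minimal standalone sketch (not compiler code) of the two-step split used
   above for adjustments below 24 bits: the low 12 bits are applied first and
   the remainder second, so each step fits a 12-bit (optionally shifted)
   immediate and the adjustment is applied monotonically, avoiding transient
   stack deallocation.  The helper name is hypothetical.  */
#if 0
static void
example_split_delta (long long delta, long long *step1, long long *step2)
{
  long long mdelta = delta < 0 ? -delta : delta;
  long long low_off = mdelta & 0xfff;
  low_off = delta < 0 ? -low_off : low_off;
  *step1 = low_off;             /* e.g. delta == -0x12345 gives step1 == -0x345 */
  *step2 = delta - low_off;     /* ... and step2 == -0x12000.  */
}
#endif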
2119 static inline void
2120 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2121 HOST_WIDE_INT delta)
2123 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2126 static inline void
2127 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2129 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2130 true, emit_move_imm);
2133 static inline void
2134 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2136 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2137 frame_related_p, true);
2140 static bool
2141 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2142 tree exp ATTRIBUTE_UNUSED)
2144 /* Currently, always true. */
2145 return true;
2148 /* Implement TARGET_PASS_BY_REFERENCE. */
2150 static bool
2151 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2152 machine_mode mode,
2153 const_tree type,
2154 bool named ATTRIBUTE_UNUSED)
2156 HOST_WIDE_INT size;
2157 machine_mode dummymode;
2158 int nregs;
2160 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2161 size = (mode == BLKmode && type)
2162 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2164 /* Aggregates are passed by reference based on their size. */
2165 if (type && AGGREGATE_TYPE_P (type))
2167 size = int_size_in_bytes (type);
2170 /* Variable sized arguments are always passed by reference. */
2171 if (size < 0)
2172 return true;
2174 /* Can this be a candidate to be passed in fp/simd register(s)? */
2175 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2176 &dummymode, &nregs,
2177 NULL))
2178 return false;
2180 /* Arguments which are variable sized or larger than 2 registers are
2181 passed by reference unless they are a homogeneous floating-point
2182 aggregate. */
2183 return size > 2 * UNITS_PER_WORD;
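/* A minimal standalone sketch (not compiler code) of the decision above:
   variable-sized arguments go by reference, fp/simd candidates (including
   HFAs and HVAs) never do, and anything else does once it exceeds two
   8-byte registers.  The helper name is hypothetical.  */
#if 0
static int
example_pass_by_reference_p (long long size_in_bytes, int fp_candidate_p)
{
  if (size_in_bytes < 0)                /* variable-sized */
    return 1;
  if (fp_candidate_p)                   /* HFA/HVA or scalar fp/simd */
    return 0;
  return size_in_bytes > 2 * 8;         /* larger than two X registers */
}
#endif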
2186 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2187 static bool
2188 aarch64_return_in_msb (const_tree valtype)
2190 machine_mode dummy_mode;
2191 int dummy_int;
2193 /* Never happens in little-endian mode. */
2194 if (!BYTES_BIG_ENDIAN)
2195 return false;
2197 /* Only composite types smaller than or equal to 16 bytes can
2198 be potentially returned in registers. */
2199 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2200 || int_size_in_bytes (valtype) <= 0
2201 || int_size_in_bytes (valtype) > 16)
2202 return false;
2204 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2205 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2206 is always passed/returned in the least significant bits of fp/simd
2207 register(s). */
2208 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2209 &dummy_mode, &dummy_int, NULL))
2210 return false;
2212 return true;
2215 /* Implement TARGET_FUNCTION_VALUE.
2216 Define how to find the value returned by a function. */
2218 static rtx
2219 aarch64_function_value (const_tree type, const_tree func,
2220 bool outgoing ATTRIBUTE_UNUSED)
2222 machine_mode mode;
2223 int unsignedp;
2224 int count;
2225 machine_mode ag_mode;
2227 mode = TYPE_MODE (type);
2228 if (INTEGRAL_TYPE_P (type))
2229 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2231 if (aarch64_return_in_msb (type))
2233 HOST_WIDE_INT size = int_size_in_bytes (type);
2235 if (size % UNITS_PER_WORD != 0)
2237 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2238 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2242 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2243 &ag_mode, &count, NULL))
2245 if (!aarch64_composite_type_p (type, mode))
2247 gcc_assert (count == 1 && mode == ag_mode);
2248 return gen_rtx_REG (mode, V0_REGNUM);
2250 else
2252 int i;
2253 rtx par;
2255 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2256 for (i = 0; i < count; i++)
2258 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2259 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2260 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2261 XVECEXP (par, 0, i) = tmp;
2263 return par;
2266 else
2267 return gen_rtx_REG (mode, R0_REGNUM);
2270 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2271 Return true if REGNO is the number of a hard register in which the value
2272 of a called function may come back. */
2274 static bool
2275 aarch64_function_value_regno_p (const unsigned int regno)
2277 /* A maximum of 16 bytes can be returned in the general registers. Examples
2278 of 16-byte return values are: 128-bit integers and 16-byte small
2279 structures (excluding homogeneous floating-point aggregates). */
2280 if (regno == R0_REGNUM || regno == R1_REGNUM)
2281 return true;
2283 /* Up to four fp/simd registers can return a function value, e.g. a
2284 homogeneous floating-point aggregate having four members. */
2285 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2286 return TARGET_FLOAT;
2288 return false;
2291 /* Implement TARGET_RETURN_IN_MEMORY.
2293 If the type T of the result of a function is such that
2294 void func (T arg)
2295 would require that arg be passed as a value in a register (or set of
2296 registers) according to the parameter passing rules, then the result
2297 is returned in the same registers as would be used for such an
2298 argument. */
2300 static bool
2301 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2303 HOST_WIDE_INT size;
2304 machine_mode ag_mode;
2305 int count;
2307 if (!AGGREGATE_TYPE_P (type)
2308 && TREE_CODE (type) != COMPLEX_TYPE
2309 && TREE_CODE (type) != VECTOR_TYPE)
2310 /* Simple scalar types are always returned in registers. */
2311 return false;
2313 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2314 type,
2315 &ag_mode,
2316 &count,
2317 NULL))
2318 return false;
2320 /* Types larger than 2 registers are returned in memory. */
2321 size = int_size_in_bytes (type);
2322 return (size < 0 || size > 2 * UNITS_PER_WORD);
2325 static bool
2326 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2327 const_tree type, int *nregs)
2329 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2330 return aarch64_vfp_is_call_or_return_candidate (mode,
2331 type,
2332 &pcum->aapcs_vfp_rmode,
2333 nregs,
2334 NULL);
2337 /* Given MODE and TYPE of a function argument, return the alignment in
2338 bits. The idea is to suppress any stronger alignment requested by
2339 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2340 This is a helper function for local use only. */
2342 static unsigned int
2343 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2345 if (!type)
2346 return GET_MODE_ALIGNMENT (mode);
2348 if (integer_zerop (TYPE_SIZE (type)))
2349 return 0;
2351 gcc_assert (TYPE_MODE (type) == mode);
2353 if (!AGGREGATE_TYPE_P (type))
2354 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2356 if (TREE_CODE (type) == ARRAY_TYPE)
2357 return TYPE_ALIGN (TREE_TYPE (type));
2359 unsigned int alignment = 0;
2360 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2361 if (TREE_CODE (field) == FIELD_DECL)
2362 alignment = std::max (alignment, DECL_ALIGN (field));
2364 return alignment;
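/* A minimal standalone illustration (not compiler code) of the aggregate rule
   above: the alignment used for argument layout is the largest alignment
   among the members.  The struct below is hypothetical.  */
#if 0
#include <stdalign.h>
struct example_arg { char c; long long x; };    /* member alignments 1 and 8 */
_Static_assert (alignof (struct example_arg) == 8,
                "layout alignment follows the most-aligned member");
#endif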
2367 /* Layout a function argument according to the AAPCS64 rules. The rule
2368 numbers refer to the rule numbers in the AAPCS64. */
2370 static void
2371 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2372 const_tree type,
2373 bool named ATTRIBUTE_UNUSED)
2375 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2376 int ncrn, nvrn, nregs;
2377 bool allocate_ncrn, allocate_nvrn;
2378 HOST_WIDE_INT size;
2380 /* We need to do this once per argument. */
2381 if (pcum->aapcs_arg_processed)
2382 return;
2384 pcum->aapcs_arg_processed = true;
2386 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2387 size
2388 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2389 UNITS_PER_WORD);
2391 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2392 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2393 mode,
2394 type,
2395 &nregs);
2397 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2398 The following code thus handles passing by SIMD/FP registers first. */
2400 nvrn = pcum->aapcs_nvrn;
2402 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2403 and homogeneous short-vector aggregates (HVA). */
2404 if (allocate_nvrn)
2406 if (!TARGET_FLOAT)
2407 aarch64_err_no_fpadvsimd (mode, "argument");
2409 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2411 pcum->aapcs_nextnvrn = nvrn + nregs;
2412 if (!aarch64_composite_type_p (type, mode))
2414 gcc_assert (nregs == 1);
2415 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2417 else
2419 rtx par;
2420 int i;
2421 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2422 for (i = 0; i < nregs; i++)
2424 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2425 V0_REGNUM + nvrn + i);
2426 tmp = gen_rtx_EXPR_LIST
2427 (VOIDmode, tmp,
2428 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2429 XVECEXP (par, 0, i) = tmp;
2431 pcum->aapcs_reg = par;
2433 return;
2435 else
2437 /* C.3 NSRN is set to 8. */
2438 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2439 goto on_stack;
2443 ncrn = pcum->aapcs_ncrn;
2444 nregs = size / UNITS_PER_WORD;
2446 /* C.6 - C.9, though the sign and zero extension semantics are
2447 handled elsewhere. This is the case where the argument fits
2448 entirely in general registers. */
2449 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2452 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2454 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2455 rounded up to the next even number. */
2456 if (nregs == 2
2457 && ncrn % 2
2458 /* The == 16 * BITS_PER_UNIT comparison is used instead of
2459 >= 16 * BITS_PER_UNIT because for alignments greater than
2460 16 * BITS_PER_UNIT nregs would be greater than 2, and the argument
2461 would then be passed by reference rather than by value. */
2462 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2464 ++ncrn;
2465 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2468 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2469 A reg is still generated for it, but the caller should be smart
2470 enough not to use it. */
2471 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2472 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2473 else
2475 rtx par;
2476 int i;
2478 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2479 for (i = 0; i < nregs; i++)
2481 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2482 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2483 GEN_INT (i * UNITS_PER_WORD));
2484 XVECEXP (par, 0, i) = tmp;
2486 pcum->aapcs_reg = par;
2489 pcum->aapcs_nextncrn = ncrn + nregs;
2490 return;
2493 /* C.11 */
2494 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2496 /* The argument is passed on the stack; record the number of words needed
2497 for this argument and align the total size if necessary. */
2498 on_stack:
2499 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2501 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2502 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2503 16 / UNITS_PER_WORD);
2504 return;
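/* A minimal standalone sketch (not compiler code) of the core-register
   bookkeeping above: the argument size is rounded up to whole 8-byte words
   and, when a 16-byte aligned argument needs two core registers, the next
   core register number is first rounded up to an even value (rule C.8).
   Names are hypothetical.  */
#if 0
static int
example_next_ncrn (int ncrn, int size_in_bytes, int align_in_bytes)
{
  int nregs = (size_in_bytes + 7) / 8;  /* ROUND_UP to 8-byte words.  */
  if (nregs == 2 && (ncrn % 2) != 0 && align_in_bytes == 16)
    ncrn++;                             /* C.8: start at an even NGRN.  */
  return ncrn + nregs;                  /* next free core register.  */
}
#endif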
2507 /* Implement TARGET_FUNCTION_ARG. */
2509 static rtx
2510 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2511 const_tree type, bool named)
2513 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2514 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2516 if (mode == VOIDmode)
2517 return NULL_RTX;
2519 aarch64_layout_arg (pcum_v, mode, type, named);
2520 return pcum->aapcs_reg;
2523 void
2524 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2525 const_tree fntype ATTRIBUTE_UNUSED,
2526 rtx libname ATTRIBUTE_UNUSED,
2527 const_tree fndecl ATTRIBUTE_UNUSED,
2528 unsigned n_named ATTRIBUTE_UNUSED)
2530 pcum->aapcs_ncrn = 0;
2531 pcum->aapcs_nvrn = 0;
2532 pcum->aapcs_nextncrn = 0;
2533 pcum->aapcs_nextnvrn = 0;
2534 pcum->pcs_variant = ARM_PCS_AAPCS64;
2535 pcum->aapcs_reg = NULL_RTX;
2536 pcum->aapcs_arg_processed = false;
2537 pcum->aapcs_stack_words = 0;
2538 pcum->aapcs_stack_size = 0;
2540 if (!TARGET_FLOAT
2541 && fndecl && TREE_PUBLIC (fndecl)
2542 && fntype && fntype != error_mark_node)
2544 const_tree type = TREE_TYPE (fntype);
2545 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2546 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2547 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2548 &mode, &nregs, NULL))
2549 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2551 return;
2554 static void
2555 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2556 machine_mode mode,
2557 const_tree type,
2558 bool named)
2560 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2561 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2563 aarch64_layout_arg (pcum_v, mode, type, named);
2564 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2565 != (pcum->aapcs_stack_words != 0));
2566 pcum->aapcs_arg_processed = false;
2567 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2568 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2569 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2570 pcum->aapcs_stack_words = 0;
2571 pcum->aapcs_reg = NULL_RTX;
2575 bool
2576 aarch64_function_arg_regno_p (unsigned regno)
2578 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2579 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2582 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2583 PARM_BOUNDARY bits of alignment, but will be given anything up
2584 to STACK_BOUNDARY bits if the type requires it. This makes sure
2585 that both before and after the layout of each argument, the Next
2586 Stacked Argument Address (NSAA) will have a minimum alignment of
2587 8 bytes. */
2589 static unsigned int
2590 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2592 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2593 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
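/* A minimal standalone sketch (not compiler code) of the clamping above,
   assuming the usual aarch64 values PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128 bits.  */
#if 0
static unsigned int
example_arg_boundary (unsigned int type_align_bits)
{
  unsigned int lower = 64, upper = 128; /* assumed boundary values */
  if (type_align_bits < lower)
    type_align_bits = lower;            /* at least 8 bytes */
  if (type_align_bits > upper)
    type_align_bits = upper;            /* at most 16 bytes */
  return type_align_bits;
}
#endif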
2596 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2598 Return true if an argument passed on the stack should be padded upwards,
2599 i.e. if the least-significant byte of the stack slot has useful data.
2601 Small aggregate types are placed in the lowest memory address.
2603 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2605 bool
2606 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2608 /* On little-endian targets, the least significant byte of every stack
2609 argument is passed at the lowest byte address of the stack slot. */
2610 if (!BYTES_BIG_ENDIAN)
2611 return true;
2613 /* Otherwise, integral, floating-point and pointer types are padded downward:
2614 the least significant byte of a stack argument is passed at the highest
2615 byte address of the stack slot. */
2616 if (type
2617 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2618 || POINTER_TYPE_P (type))
2619 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2620 return false;
2622 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2623 return true;
2626 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2628 It specifies padding for the last (and possibly the only)
2629 element of a block move between registers and memory. Assuming
2630 the block is in memory, padding upward means that the last
2631 element is padded after its most significant byte, while with
2632 downward padding the last element is padded on its least
2633 significant byte side.
2635 Small aggregates and small complex types are always padded
2636 upwards.
2638 We don't need to worry about homogeneous floating-point or
2639 short-vector aggregates; their move is not affected by the
2640 padding direction determined here. Regardless of endianness,
2641 each element of such an aggregate is put in the least
2642 significant bits of a fp/simd register.
2644 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2645 register has useful data, and return the opposite if the most
2646 significant byte does. */
2648 bool
2649 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2650 bool first ATTRIBUTE_UNUSED)
2653 /* Small composite types are always padded upward. */
2654 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2656 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2657 : GET_MODE_SIZE (mode));
2658 if (size < 2 * UNITS_PER_WORD)
2659 return true;
2662 /* Otherwise, use the default padding. */
2663 return !BYTES_BIG_ENDIAN;
2666 static scalar_int_mode
2667 aarch64_libgcc_cmp_return_mode (void)
2669 return SImode;
2672 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2674 /* We use the 12-bit shifted immediate arithmetic instructions so values
2675 must be a multiple of (1 << 12), i.e. 4096. */
2676 #define ARITH_FACTOR 4096
2678 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2679 #error Cannot use simple address calculation for stack probing
2680 #endif
2682 /* The pair of scratch registers used for stack probing. */
2683 #define PROBE_STACK_FIRST_REG 9
2684 #define PROBE_STACK_SECOND_REG 10
2686 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2687 inclusive. These are offsets from the current stack pointer. */
2689 static void
2690 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2692 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2694 /* See the same assertion on PROBE_INTERVAL above. */
2695 gcc_assert ((first % ARITH_FACTOR) == 0);
2697 /* See if we have a constant small number of probes to generate. If so,
2698 that's the easy case. */
2699 if (size <= PROBE_INTERVAL)
2701 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2703 emit_set_insn (reg1,
2704 plus_constant (Pmode,
2705 stack_pointer_rtx, -(first + base)));
2706 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2709 /* The run-time loop is made up of 8 insns in the generic case while the
2710 compile-time loop is made up of 4 + 2 * (n - 2) insns for n intervals. */
2711 else if (size <= 4 * PROBE_INTERVAL)
2713 HOST_WIDE_INT i, rem;
2715 emit_set_insn (reg1,
2716 plus_constant (Pmode,
2717 stack_pointer_rtx,
2718 -(first + PROBE_INTERVAL)));
2719 emit_stack_probe (reg1);
2721 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2722 it exceeds SIZE. If only two probes are needed, this will not
2723 generate any code. Then probe at FIRST + SIZE. */
2724 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2726 emit_set_insn (reg1,
2727 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2728 emit_stack_probe (reg1);
2731 rem = size - (i - PROBE_INTERVAL);
2732 if (rem > 256)
2734 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2736 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2737 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2739 else
2740 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2743 /* Otherwise, do the same as above, but in a loop. Note that we must be
2744 extra careful with variables wrapping around because we might be at
2745 the very top (or the very bottom) of the address space and we have
2746 to be able to handle this case properly; in particular, we use an
2747 equality test for the loop condition. */
2748 else
2750 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2752 /* Step 1: round SIZE to the previous multiple of the interval. */
2754 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2757 /* Step 2: compute initial and final value of the loop counter. */
2759 /* TEST_ADDR = SP + FIRST. */
2760 emit_set_insn (reg1,
2761 plus_constant (Pmode, stack_pointer_rtx, -first));
2763 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2764 HOST_WIDE_INT adjustment = - (first + rounded_size);
2765 if (! aarch64_uimm12_shift (adjustment))
2767 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2768 true, Pmode);
2769 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2771 else
2773 emit_set_insn (reg2,
2774 plus_constant (Pmode, stack_pointer_rtx, adjustment));
2777 /* Step 3: the loop
2781 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2782 probe at TEST_ADDR
2784 while (TEST_ADDR != LAST_ADDR)
2786 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2787 until it is equal to ROUNDED_SIZE. */
2789 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2792 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2793 that SIZE is equal to ROUNDED_SIZE. */
2795 if (size != rounded_size)
2797 HOST_WIDE_INT rem = size - rounded_size;
2799 if (rem > 256)
2801 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2803 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2804 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2806 else
2807 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2811 /* Make sure nothing is scheduled before we are done. */
2812 emit_insn (gen_blockage ());
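/* A minimal standalone sketch (not compiler code) of where the probes above
   land: one at FIRST + N * PROBE_INTERVAL for each whole interval, plus a
   final probe at FIRST + SIZE when SIZE is not an exact multiple of the
   interval.  The helper name is hypothetical.  */
#if 0
static int
example_probe_offsets (long long first, long long size, long long interval,
                       long long *offsets, int max_probes)
{
  int n = 0;
  for (long long off = interval; off <= size && n < max_probes; off += interval)
    offsets[n++] = first + off;
  if (size % interval != 0 && n < max_probes)
    offsets[n++] = first + size;        /* residual probe */
  return n;
}
#endif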
2815 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2816 absolute addresses. */
2818 const char *
2819 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2821 static int labelno = 0;
2822 char loop_lab[32];
2823 rtx xops[2];
2825 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2827 /* Loop. */
2828 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2830 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2831 xops[0] = reg1;
2832 xops[1] = GEN_INT (PROBE_INTERVAL);
2833 output_asm_insn ("sub\t%0, %0, %1", xops);
2835 /* Probe at TEST_ADDR. */
2836 output_asm_insn ("str\txzr, [%0]", xops);
2838 /* Test if TEST_ADDR == LAST_ADDR. */
2839 xops[1] = reg2;
2840 output_asm_insn ("cmp\t%0, %1", xops);
2842 /* Branch. */
2843 fputs ("\tb.ne\t", asm_out_file);
2844 assemble_name_raw (asm_out_file, loop_lab);
2845 fputc ('\n', asm_out_file);
2847 return "";
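/* For reference, the loop emitted above has roughly this shape, assuming the
   default PROBE_INTERVAL of 4096 and the scratch registers x9/x10 chosen by
   PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG (the label name is of the
   form .LPSRL<N>):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */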
2850 static bool
2851 aarch64_frame_pointer_required (void)
2853 /* In aarch64_override_options_after_change
2854 flag_omit_leaf_frame_pointer turns off the frame pointer by
2855 default. Turn it back on now if we do not have a leaf
2856 function. */
2857 if (flag_omit_leaf_frame_pointer
2858 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2859 return true;
2861 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2862 if (crtl->calls_eh_return)
2863 return true;
2865 return false;
2868 /* Mark the registers that need to be saved by the callee and calculate
2869 the size of the callee-saved registers area and frame record (both FP
2870 and LR may be omitted). */
2871 static void
2872 aarch64_layout_frame (void)
2874 HOST_WIDE_INT offset = 0;
2875 int regno, last_fp_reg = INVALID_REGNUM;
2877 if (reload_completed && cfun->machine->frame.laid_out)
2878 return;
2880 #define SLOT_NOT_REQUIRED (-2)
2881 #define SLOT_REQUIRED (-1)
2883 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2884 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2886 /* First mark all the registers that really need to be saved... */
2887 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2888 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2890 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2891 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2893 /* ... that includes the eh data registers (if needed)... */
2894 if (crtl->calls_eh_return)
2895 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2896 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2897 = SLOT_REQUIRED;
2899 /* ... and any callee saved register that dataflow says is live. */
2900 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2901 if (df_regs_ever_live_p (regno)
2902 && (regno == R30_REGNUM
2903 || !call_used_regs[regno]))
2904 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2906 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2907 if (df_regs_ever_live_p (regno)
2908 && !call_used_regs[regno])
2910 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2911 last_fp_reg = regno;
2914 if (frame_pointer_needed)
2916 /* FP and LR are placed in the linkage record. */
2917 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2918 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2919 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2920 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2921 offset += 2 * UNITS_PER_WORD;
2924 /* Now assign stack slots for them. */
2925 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2926 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2928 cfun->machine->frame.reg_offset[regno] = offset;
2929 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2930 cfun->machine->frame.wb_candidate1 = regno;
2931 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2932 cfun->machine->frame.wb_candidate2 = regno;
2933 offset += UNITS_PER_WORD;
2936 HOST_WIDE_INT max_int_offset = offset;
2937 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2938 bool has_align_gap = offset != max_int_offset;
2940 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2941 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2943 /* If there is an alignment gap between integer and fp callee-saves,
2944 allocate the last fp register to it if possible. */
2945 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2947 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2948 break;
2951 cfun->machine->frame.reg_offset[regno] = offset;
2952 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2953 cfun->machine->frame.wb_candidate1 = regno;
2954 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2955 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2956 cfun->machine->frame.wb_candidate2 = regno;
2957 offset += UNITS_PER_WORD;
2960 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2962 cfun->machine->frame.saved_regs_size = offset;
2964 HOST_WIDE_INT varargs_and_saved_regs_size
2965 = offset + cfun->machine->frame.saved_varargs_size;
2967 cfun->machine->frame.hard_fp_offset
2968 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2969 STACK_BOUNDARY / BITS_PER_UNIT);
2971 cfun->machine->frame.frame_size
2972 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2973 + crtl->outgoing_args_size,
2974 STACK_BOUNDARY / BITS_PER_UNIT);
2976 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2978 cfun->machine->frame.initial_adjust = 0;
2979 cfun->machine->frame.final_adjust = 0;
2980 cfun->machine->frame.callee_adjust = 0;
2981 cfun->machine->frame.callee_offset = 0;
2983 HOST_WIDE_INT max_push_offset = 0;
2984 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2985 max_push_offset = 512;
2986 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2987 max_push_offset = 256;
2989 if (cfun->machine->frame.frame_size < max_push_offset
2990 && crtl->outgoing_args_size == 0)
2992 /* Simple, small frame with no outgoing arguments:
2993 stp reg1, reg2, [sp, -frame_size]!
2994 stp reg3, reg4, [sp, 16] */
2995 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2997 else if ((crtl->outgoing_args_size
2998 + cfun->machine->frame.saved_regs_size < 512)
2999 && !(cfun->calls_alloca
3000 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3002 /* Frame with small outgoing arguments:
3003 sub sp, sp, frame_size
3004 stp reg1, reg2, [sp, outgoing_args_size]
3005 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3006 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3007 cfun->machine->frame.callee_offset
3008 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3010 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3012 /* Frame with large outgoing arguments but a small local area:
3013 stp reg1, reg2, [sp, -hard_fp_offset]!
3014 stp reg3, reg4, [sp, 16]
3015 sub sp, sp, outgoing_args_size */
3016 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3017 cfun->machine->frame.final_adjust
3018 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3020 else if (!frame_pointer_needed
3021 && varargs_and_saved_regs_size < max_push_offset)
3023 /* Frame with large local area and outgoing arguments (this pushes the
3024 callee-saves first, followed by the locals and outgoing area):
3025 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3026 stp reg3, reg4, [sp, 16]
3027 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3028 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3029 cfun->machine->frame.final_adjust
3030 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3031 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3032 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3034 else
3036 /* Frame with large local area and outgoing arguments using frame pointer:
3037 sub sp, sp, hard_fp_offset
3038 stp x29, x30, [sp, 0]
3039 add x29, sp, 0
3040 stp reg3, reg4, [sp, 16]
3041 sub sp, sp, outgoing_args_size */
3042 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3043 cfun->machine->frame.final_adjust
3044 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3047 cfun->machine->frame.laid_out = true;
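/* A minimal standalone sketch (not compiler code) of the strategy selection
   above, returning which of the prologue shapes would be used.  The enum and
   helper names are hypothetical and the sketch ignores the alloca and
   no-frame-pointer special cases.  */
#if 0
enum example_frame_shape { PUSH_ALL, SUB_THEN_STORE, PUSH_THEN_SUB, FP_FRAME };

static enum example_frame_shape
example_choose_shape (long long frame_size, long long outgoing_args,
                      long long saved_regs, long long hard_fp_offset,
                      long long max_push)
{
  if (frame_size < max_push && outgoing_args == 0)
    return PUSH_ALL;            /* stp ..., [sp, -frame_size]!  */
  if (outgoing_args + saved_regs < 512)
    return SUB_THEN_STORE;      /* sub sp; stp at outgoing_args  */
  if (hard_fp_offset < max_push)
    return PUSH_THEN_SUB;       /* stp ..., [sp, -hard_fp_offset]!; sub sp  */
  return FP_FRAME;              /* sub sp; set up x29; sub sp again  */
}
#endif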
3050 /* Return true if the register REGNO is saved on entry to
3051 the current function. */
3053 static bool
3054 aarch64_register_saved_on_entry (int regno)
3056 return cfun->machine->frame.reg_offset[regno] >= 0;
3059 /* Return the next register, from REGNO up to LIMIT, that the callee
3060 needs to save. */
3062 static unsigned
3063 aarch64_next_callee_save (unsigned regno, unsigned limit)
3065 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3066 regno++;
3067 return regno;
3070 /* Push the register number REGNO of mode MODE to the stack with write-back
3071 adjusting the stack by ADJUSTMENT. */
3073 static void
3074 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3075 HOST_WIDE_INT adjustment)
3077 rtx base_rtx = stack_pointer_rtx;
3078 rtx insn, reg, mem;
3080 reg = gen_rtx_REG (mode, regno);
3081 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3082 plus_constant (Pmode, base_rtx, -adjustment));
3083 mem = gen_frame_mem (mode, mem);
3085 insn = emit_move_insn (mem, reg);
3086 RTX_FRAME_RELATED_P (insn) = 1;
3089 /* Generate and return an instruction to store the pair of registers
3090 REG and REG2 of mode MODE to location BASE with write-back adjusting
3091 the stack location BASE by ADJUSTMENT. */
3093 static rtx
3094 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3095 HOST_WIDE_INT adjustment)
3097 switch (mode)
3099 case E_DImode:
3100 return gen_storewb_pairdi_di (base, base, reg, reg2,
3101 GEN_INT (-adjustment),
3102 GEN_INT (UNITS_PER_WORD - adjustment));
3103 case E_DFmode:
3104 return gen_storewb_pairdf_di (base, base, reg, reg2,
3105 GEN_INT (-adjustment),
3106 GEN_INT (UNITS_PER_WORD - adjustment));
3107 default:
3108 gcc_unreachable ();
3112 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3113 stack pointer by ADJUSTMENT. */
3115 static void
3116 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3118 rtx_insn *insn;
3119 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3121 if (regno2 == INVALID_REGNUM)
3122 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3124 rtx reg1 = gen_rtx_REG (mode, regno1);
3125 rtx reg2 = gen_rtx_REG (mode, regno2);
3127 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3128 reg2, adjustment));
3129 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3130 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3131 RTX_FRAME_RELATED_P (insn) = 1;
3134 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3135 adjusting it by ADJUSTMENT afterwards. */
3137 static rtx
3138 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3139 HOST_WIDE_INT adjustment)
3141 switch (mode)
3143 case E_DImode:
3144 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3145 GEN_INT (UNITS_PER_WORD));
3146 case E_DFmode:
3147 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3148 GEN_INT (UNITS_PER_WORD));
3149 default:
3150 gcc_unreachable ();
3154 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3155 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3156 into CFI_OPS. */
3158 static void
3159 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3160 rtx *cfi_ops)
3162 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3163 rtx reg1 = gen_rtx_REG (mode, regno1);
3165 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3167 if (regno2 == INVALID_REGNUM)
3169 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3170 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3171 emit_move_insn (reg1, gen_frame_mem (mode, mem));
3173 else
3175 rtx reg2 = gen_rtx_REG (mode, regno2);
3176 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3177 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3178 reg2, adjustment));
3182 /* Generate and return a store pair instruction of mode MODE to store
3183 register REG1 to MEM1 and register REG2 to MEM2. */
3185 static rtx
3186 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3187 rtx reg2)
3189 switch (mode)
3191 case E_DImode:
3192 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3194 case E_DFmode:
3195 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3197 default:
3198 gcc_unreachable ();
3202 /* Generate and return a load pair instruction of mode MODE to load register
3203 REG1 from MEM1 and register REG2 from MEM2. */
3205 static rtx
3206 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3207 rtx mem2)
3209 switch (mode)
3211 case E_DImode:
3212 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3214 case E_DFmode:
3215 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3217 default:
3218 gcc_unreachable ();
3222 /* Return TRUE if return address signing should be enabled for the current
3223 function, otherwise return FALSE. */
3225 bool
3226 aarch64_return_address_signing_enabled (void)
3228 /* This function should only be called after the frame is laid out. */
3229 gcc_assert (cfun->machine->frame.laid_out);
3231 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3232 if its LR is pushed onto the stack. */
3233 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3234 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3235 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3238 /* Emit code to save the callee-saved registers from register number START
3239 to LIMIT to the stack at the location starting at offset START_OFFSET,
3240 skipping any write-back candidates if SKIP_WB is true. */
3242 static void
3243 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3244 unsigned start, unsigned limit, bool skip_wb)
3246 rtx_insn *insn;
3247 unsigned regno;
3248 unsigned regno2;
3250 for (regno = aarch64_next_callee_save (start, limit);
3251 regno <= limit;
3252 regno = aarch64_next_callee_save (regno + 1, limit))
3254 rtx reg, mem;
3255 HOST_WIDE_INT offset;
3257 if (skip_wb
3258 && (regno == cfun->machine->frame.wb_candidate1
3259 || regno == cfun->machine->frame.wb_candidate2))
3260 continue;
3262 if (cfun->machine->reg_is_wrapped_separately[regno])
3263 continue;
3265 reg = gen_rtx_REG (mode, regno);
3266 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3267 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3268 offset));
3270 regno2 = aarch64_next_callee_save (regno + 1, limit);
3272 if (regno2 <= limit
3273 && !cfun->machine->reg_is_wrapped_separately[regno2]
3274 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3275 == cfun->machine->frame.reg_offset[regno2]))
3278 rtx reg2 = gen_rtx_REG (mode, regno2);
3279 rtx mem2;
3281 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3282 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3283 offset));
3284 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3285 reg2));
3287 /* The first part of a frame-related parallel insn is
3288 always assumed to be relevant to the frame
3289 calculations; subsequent parts are only
3290 frame-related if explicitly marked. */
3291 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3292 regno = regno2;
3294 else
3295 insn = emit_move_insn (mem, reg);
3297 RTX_FRAME_RELATED_P (insn) = 1;
3301 /* Emit code to restore the callee registers of mode MODE from register
3302 number START up to and including LIMIT. Restore from the stack offset
3303 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3304 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3306 static void
3307 aarch64_restore_callee_saves (machine_mode mode,
3308 HOST_WIDE_INT start_offset, unsigned start,
3309 unsigned limit, bool skip_wb, rtx *cfi_ops)
3311 rtx base_rtx = stack_pointer_rtx;
3312 unsigned regno;
3313 unsigned regno2;
3314 HOST_WIDE_INT offset;
3316 for (regno = aarch64_next_callee_save (start, limit);
3317 regno <= limit;
3318 regno = aarch64_next_callee_save (regno + 1, limit))
3320 if (cfun->machine->reg_is_wrapped_separately[regno])
3321 continue;
3323 rtx reg, mem;
3325 if (skip_wb
3326 && (regno == cfun->machine->frame.wb_candidate1
3327 || regno == cfun->machine->frame.wb_candidate2))
3328 continue;
3330 reg = gen_rtx_REG (mode, regno);
3331 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3332 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3334 regno2 = aarch64_next_callee_save (regno + 1, limit);
3336 if (regno2 <= limit
3337 && !cfun->machine->reg_is_wrapped_separately[regno2]
3338 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3339 == cfun->machine->frame.reg_offset[regno2]))
3341 rtx reg2 = gen_rtx_REG (mode, regno2);
3342 rtx mem2;
3344 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3345 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3346 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3348 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3349 regno = regno2;
3351 else
3352 emit_move_insn (reg, mem);
3353 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3357 static inline bool
3358 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3359 HOST_WIDE_INT offset)
3361 return offset >= -256 && offset < 256;
3364 static inline bool
3365 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3367 return (offset >= 0
3368 && offset < 4096 * GET_MODE_SIZE (mode)
3369 && offset % GET_MODE_SIZE (mode) == 0);
3372 bool
3373 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3375 return (offset >= -64 * GET_MODE_SIZE (mode)
3376 && offset < 64 * GET_MODE_SIZE (mode)
3377 && offset % GET_MODE_SIZE (mode) == 0);
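/* A minimal standalone check (not compiler code) of the ranges above for an
   8-byte mode such as DImode: the 12-bit unsigned scaled form reaches
   0..32760 in steps of 8, the 9-bit unscaled form covers -256..255, and the
   7-bit signed scaled form covers -512..504.  */
#if 0
_Static_assert (4096 * 8 - 8 == 32760, "12-bit scaled upper bound");
_Static_assert (-64 * 8 == -512 && 64 * 8 - 8 == 504, "7-bit scaled bounds");
#endif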
3380 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3382 static sbitmap
3383 aarch64_get_separate_components (void)
3385 aarch64_layout_frame ();
3387 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3388 bitmap_clear (components);
3390 /* The registers we need saved to the frame. */
3391 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3392 if (aarch64_register_saved_on_entry (regno))
3394 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3395 if (!frame_pointer_needed)
3396 offset += cfun->machine->frame.frame_size
3397 - cfun->machine->frame.hard_fp_offset;
3398 /* Check that we can access the stack slot of the register with one
3399 direct load with no adjustments needed. */
3400 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3401 bitmap_set_bit (components, regno);
3404 /* Don't mess with the hard frame pointer. */
3405 if (frame_pointer_needed)
3406 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3408 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3409 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3410 /* If aarch64_layout_frame has chosen registers to store/restore with
3411 writeback don't interfere with them to avoid having to output explicit
3412 stack adjustment instructions. */
3413 if (reg2 != INVALID_REGNUM)
3414 bitmap_clear_bit (components, reg2);
3415 if (reg1 != INVALID_REGNUM)
3416 bitmap_clear_bit (components, reg1);
3418 bitmap_clear_bit (components, LR_REGNUM);
3419 bitmap_clear_bit (components, SP_REGNUM);
3421 return components;
3424 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3426 static sbitmap
3427 aarch64_components_for_bb (basic_block bb)
3429 bitmap in = DF_LIVE_IN (bb);
3430 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3431 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3433 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3434 bitmap_clear (components);
3436 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3437 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3438 if ((!call_used_regs[regno])
3439 && (bitmap_bit_p (in, regno)
3440 || bitmap_bit_p (gen, regno)
3441 || bitmap_bit_p (kill, regno)))
3442 bitmap_set_bit (components, regno);
3444 return components;
3447 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3448 Nothing to do for aarch64. */
3450 static void
3451 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3455 /* Return the next set bit in BMP from START onwards. Return the total number
3456 of bits in BMP if no set bit is found at or after START. */
3458 static unsigned int
3459 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3461 unsigned int nbits = SBITMAP_SIZE (bmp);
3462 if (start == nbits)
3463 return start;
3465 gcc_assert (start < nbits);
3466 for (unsigned int i = start; i < nbits; i++)
3467 if (bitmap_bit_p (bmp, i))
3468 return i;
3470 return nbits;
3473 /* Do the work for aarch64_emit_prologue_components and
3474 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3475 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
3476 for these components or the epilogue sequence. That is, it determines
3477 whether we should emit stores or loads and what kind of CFA notes to attach
3478 to the insns. Otherwise the logic for the two sequences is very
3479 similar. */
3481 static void
3482 aarch64_process_components (sbitmap components, bool prologue_p)
3484 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3485 ? HARD_FRAME_POINTER_REGNUM
3486 : STACK_POINTER_REGNUM);
3488 unsigned last_regno = SBITMAP_SIZE (components);
3489 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3490 rtx_insn *insn = NULL;
3492 while (regno != last_regno)
3494 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3495 so DFmode for the vector registers is enough. */
3496 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3497 rtx reg = gen_rtx_REG (mode, regno);
3498 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3499 if (!frame_pointer_needed)
3500 offset += cfun->machine->frame.frame_size
3501 - cfun->machine->frame.hard_fp_offset;
3502 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3503 rtx mem = gen_frame_mem (mode, addr);
3505 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3506 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3507 /* No more registers to handle after REGNO.
3508 Emit a single save/restore and exit. */
3509 if (regno2 == last_regno)
3511 insn = emit_insn (set);
3512 RTX_FRAME_RELATED_P (insn) = 1;
3513 if (prologue_p)
3514 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3515 else
3516 add_reg_note (insn, REG_CFA_RESTORE, reg);
3517 break;
3520 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3521 /* The next register is not of the same class or its offset is not
3522 mergeable with the current one into a pair. */
3523 if (!satisfies_constraint_Ump (mem)
3524 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3525 || (offset2 - cfun->machine->frame.reg_offset[regno])
3526 != GET_MODE_SIZE (mode))
3528 insn = emit_insn (set);
3529 RTX_FRAME_RELATED_P (insn) = 1;
3530 if (prologue_p)
3531 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3532 else
3533 add_reg_note (insn, REG_CFA_RESTORE, reg);
3535 regno = regno2;
3536 continue;
3539 /* REGNO2 can be saved/restored in a pair with REGNO. */
3540 rtx reg2 = gen_rtx_REG (mode, regno2);
3541 if (!frame_pointer_needed)
3542 offset2 += cfun->machine->frame.frame_size
3543 - cfun->machine->frame.hard_fp_offset;
3544 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3545 rtx mem2 = gen_frame_mem (mode, addr2);
3546 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3547 : gen_rtx_SET (reg2, mem2);
3549 if (prologue_p)
3550 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3551 else
3552 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3554 RTX_FRAME_RELATED_P (insn) = 1;
3555 if (prologue_p)
3557 add_reg_note (insn, REG_CFA_OFFSET, set);
3558 add_reg_note (insn, REG_CFA_OFFSET, set2);
3560 else
3562 add_reg_note (insn, REG_CFA_RESTORE, reg);
3563 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3566 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3570 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3572 static void
3573 aarch64_emit_prologue_components (sbitmap components)
3575 aarch64_process_components (components, true);
3578 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3580 static void
3581 aarch64_emit_epilogue_components (sbitmap components)
3583 aarch64_process_components (components, false);
3586 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3588 static void
3589 aarch64_set_handled_components (sbitmap components)
3591 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3592 if (bitmap_bit_p (components, regno))
3593 cfun->machine->reg_is_wrapped_separately[regno] = true;
3596 /* AArch64 stack frames generated by this compiler look like:
3598 +-------------------------------+
3600 | incoming stack arguments |
3602 +-------------------------------+
3603 | | <-- incoming stack pointer (aligned)
3604 | callee-allocated save area |
3605 | for register varargs |
3607 +-------------------------------+
3608 | local variables | <-- frame_pointer_rtx
3610 +-------------------------------+
3611 | padding0 | \
3612 +-------------------------------+ |
3613 | callee-saved registers | | frame.saved_regs_size
3614 +-------------------------------+ |
3615 | LR' | |
3616 +-------------------------------+ |
3617 | FP' | / <- hard_frame_pointer_rtx (aligned)
3618 +-------------------------------+
3619 | dynamic allocation |
3620 +-------------------------------+
3621 | padding |
3622 +-------------------------------+
3623 | outgoing stack arguments | <-- arg_pointer
3625 +-------------------------------+
3626 | | <-- stack_pointer_rtx (aligned)
3628 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3629 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3630 unchanged. */
3632 /* Generate the prologue instructions for entry into a function.
3633 Establish the stack frame by decreasing the stack pointer with a
3634 properly calculated size and, if necessary, create a frame record
3635 filled with the values of LR and previous frame pointer. The
3636 current FP is also set up if it is in use. */
3638 void
3639 aarch64_expand_prologue (void)
3641 aarch64_layout_frame ();
3643 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3644 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3645 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3646 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3647 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3648 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3649 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3650 rtx_insn *insn;
3652 /* Sign return address for functions. */
3653 if (aarch64_return_address_signing_enabled ())
3655 insn = emit_insn (gen_pacisp ());
3656 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3657 RTX_FRAME_RELATED_P (insn) = 1;
3660 if (flag_stack_usage_info)
3661 current_function_static_stack_size = frame_size;
3663 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3665 if (crtl->is_leaf && !cfun->calls_alloca)
3667 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3668 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3669 frame_size - STACK_CHECK_PROTECT);
3671 else if (frame_size > 0)
3672 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3675 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3677 if (callee_adjust != 0)
3678 aarch64_push_regs (reg1, reg2, callee_adjust);
3680 if (frame_pointer_needed)
3682 if (callee_adjust == 0)
3683 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3684 R30_REGNUM, false);
3685 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3686 stack_pointer_rtx,
3687 GEN_INT (callee_offset)));
3688 RTX_FRAME_RELATED_P (insn) = 1;
3689 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3692 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3693 callee_adjust != 0 || frame_pointer_needed);
3694 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3695 callee_adjust != 0 || frame_pointer_needed);
3696 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3699 /* Return TRUE if we can use a simple_return insn.
3701 This function checks whether the callee saved stack is empty, which
3702 means no restore actions are needed. The pro_and_epilogue pass uses
3703 this to check whether the shrink-wrapping optimization is feasible. */
3705 bool
3706 aarch64_use_return_insn_p (void)
3708 if (!reload_completed)
3709 return false;
3711 if (crtl->profile)
3712 return false;
3714 aarch64_layout_frame ();
3716 return cfun->machine->frame.frame_size == 0;
3719 /* Generate the epilogue instructions for returning from a function.
3720 This is almost exactly the reverse of the prologue sequence, except
3721 that we need to insert barriers to avoid scheduling loads that read
3722 from a deallocated stack, and we optimize the unwind records by
3723 emitting them all together if possible. */
3724 void
3725 aarch64_expand_epilogue (bool for_sibcall)
3727 aarch64_layout_frame ();
3729 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3730 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3731 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3732 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3733 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3734 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3735 rtx cfi_ops = NULL;
3736 rtx_insn *insn;
3738 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3739 bool need_barrier_p = (get_frame_size ()
3740 + cfun->machine->frame.saved_varargs_size) != 0;
3742 /* Emit a barrier to prevent loads from a deallocated stack. */
3743 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3744 || crtl->calls_eh_return)
3746 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3747 need_barrier_p = false;
3750 /* Restore the stack pointer from the frame pointer if it may not
3751 be the same as the stack pointer. */
3752 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3754 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3755 hard_frame_pointer_rtx,
3756 GEN_INT (-callee_offset)));
3757 /* If writeback is used when restoring callee-saves, the CFA
3758 is restored on the instruction doing the writeback. */
3759 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3761 else
3762 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3764 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3765 callee_adjust != 0, &cfi_ops);
3766 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3767 callee_adjust != 0, &cfi_ops);
3769 if (need_barrier_p)
3770 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3772 if (callee_adjust != 0)
3773 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3775 if (callee_adjust != 0 || initial_adjust > 65536)
3777 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3778 insn = get_last_insn ();
3779 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3780 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3781 RTX_FRAME_RELATED_P (insn) = 1;
3782 cfi_ops = NULL;
3785 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3787 if (cfi_ops)
3789 /* Emit delayed restores and reset the CFA to be SP. */
3790 insn = get_last_insn ();
3791 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3792 REG_NOTES (insn) = cfi_ops;
3793 RTX_FRAME_RELATED_P (insn) = 1;
3796 /* We prefer to emit the combined return/authenticate instruction RETAA,
3797 however there are three cases in which we must instead emit an explicit
3798 authentication instruction.
3800 1) Sibcalls don't return in a normal way, so if we're about to call one
3801 we must authenticate.
3803 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3804 generating code for !TARGET_ARMV8_3 we can't use it and must
3805 explicitly authenticate.
3807 3) On an eh_return path we make extra stack adjustments to update the
3808 canonical frame address to be the exception handler's CFA. We want
3809 to authenticate using the CFA of the function which calls eh_return.
3811 if (aarch64_return_address_signing_enabled ()
3812 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3814 insn = emit_insn (gen_autisp ());
3815 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3816 RTX_FRAME_RELATED_P (insn) = 1;
3819 /* Stack adjustment for exception handler. */
3820 if (crtl->calls_eh_return)
3822 /* We need to unwind the stack by the offset computed by
3823 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3824 to be SP; letting the CFA move during this adjustment
3825 is just as correct as retaining the CFA from the body
3826 of the function. Therefore, do nothing special. */
3827 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3830 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3831 if (!for_sibcall)
3832 emit_jump_insn (ret_rtx);
3835 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3836 normally or return to a previous frame after unwinding.
3838 An EH return uses a single shared return sequence. The epilogue is
3839 exactly like a normal epilogue except that it has an extra input
3840 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3841 that must be applied after the frame has been destroyed. An extra label
3842 is inserted before the epilogue which initializes this register to zero,
3843 and this is the entry point for a normal return.
3845 An actual EH return updates the return address, initializes the stack
3846 adjustment and jumps directly into the epilogue (bypassing the zeroing
3847 of the adjustment). Since the return address is typically saved on the
3848 stack when a function makes a call, the saved LR must be updated outside
3849 the epilogue.
3851 This poses problems as the store is generated well before the epilogue,
3852 so the offset of LR is not known yet. Also optimizations will remove the
3853 store as it appears dead, even after the epilogue is generated (as the
3854 base or offset for loading LR is different in many cases).
3856 To avoid these problems this implementation forces the frame pointer
3857 in eh_return functions so that the location of LR is fixed and known early.
3858 It also marks the store volatile, so no optimization is permitted to
3859 remove the store. */
3861 aarch64_eh_return_handler_rtx (void)
3863 rtx tmp = gen_frame_mem (Pmode,
3864 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3866 /* Mark the store volatile, so no optimization is permitted to remove it. */
3867 MEM_VOLATILE_P (tmp) = true;
3868 return tmp;
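/* As a hedged illustration: with the frame pointer forced for eh_return
   functions, the saved LR sits immediately above the saved frame pointer,
   so under the default LP64 ABI the rtx built above is roughly

     (mem/v:DI (plus:DI (reg/f:DI 29 x29) (const_int 8)))

   i.e. a volatile reference to [x29, #8], which the unwinder can update
   without the store being optimized away.  */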
3871 /* Output code to add DELTA to the first argument, and then jump
3872 to FUNCTION. Used for C++ multiple inheritance. */
3873 static void
3874 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3875 HOST_WIDE_INT delta,
3876 HOST_WIDE_INT vcall_offset,
3877 tree function)
3879 /* The this pointer is always in x0. Note that this differs from
3880 Arm where the this pointer may be bumped to r1 if r0 is required
3881 to return a pointer to an aggregate. On AArch64 a result value
3882 pointer will be in x8. */
3883 int this_regno = R0_REGNUM;
3884 rtx this_rtx, temp0, temp1, addr, funexp;
3885 rtx_insn *insn;
3887 reload_completed = 1;
3888 emit_note (NOTE_INSN_PROLOGUE_END);
3890 if (vcall_offset == 0)
3891 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3892 else
3894 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3896 this_rtx = gen_rtx_REG (Pmode, this_regno);
3897 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3898 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3900 addr = this_rtx;
3901 if (delta != 0)
3903 if (delta >= -256 && delta < 256)
3904 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3905 plus_constant (Pmode, this_rtx, delta));
3906 else
3907 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3910 if (Pmode == ptr_mode)
3911 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3912 else
3913 aarch64_emit_move (temp0,
3914 gen_rtx_ZERO_EXTEND (Pmode,
3915 gen_rtx_MEM (ptr_mode, addr)));
3917 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3918 addr = plus_constant (Pmode, temp0, vcall_offset);
3919 else
3921 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3922 Pmode);
3923 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3926 if (Pmode == ptr_mode)
3927 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3928 else
3929 aarch64_emit_move (temp1,
3930 gen_rtx_SIGN_EXTEND (Pmode,
3931 gen_rtx_MEM (ptr_mode, addr)));
3933 emit_insn (gen_add2_insn (this_rtx, temp1));
3936 /* Generate a tail call to the target function. */
3937 if (!TREE_USED (function))
3939 assemble_external (function);
3940 TREE_USED (function) = 1;
3942 funexp = XEXP (DECL_RTL (function), 0);
3943 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3944 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3945 SIBLING_CALL_P (insn) = 1;
3947 insn = get_insns ();
3948 shorten_branches (insn);
3949 final_start_function (insn, file, 1);
3950 final (insn, file, 1);
3951 final_end_function ();
3953 /* Stop pretending to be a post-reload pass. */
3954 reload_completed = 0;
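/* A hedged example of the code such a thunk is expected to produce,
   assuming DELTA == 16, VCALL_OFFSET == 0 and a hypothetical target
   symbol _ZN1D1fEv (the exact sequence depends on the constants and on
   how the target binds):

     add x0, x0, 16      // adjust the this pointer by DELTA
     b   _ZN1D1fEv       // tail call the target function

   With a non-zero VCALL_OFFSET, the code above first loads the vtable
   pointer from [x0] into x16, loads the adjustment from
   [x16, #VCALL_OFFSET] into x17 and adds it to x0 before branching.  */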
3957 static bool
3958 aarch64_tls_referenced_p (rtx x)
3960 if (!TARGET_HAVE_TLS)
3961 return false;
3962 subrtx_iterator::array_type array;
3963 FOR_EACH_SUBRTX (iter, array, x, ALL)
3965 const_rtx x = *iter;
3966 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3967 return true;
3968 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3969 TLS offsets, not real symbol references. */
3970 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3971 iter.skip_subrtxes ();
3973 return false;
3977 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3978 a left shift of 0 or 12 bits. */
3979 bool
3980 aarch64_uimm12_shift (HOST_WIDE_INT val)
3982 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3983 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
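/* For illustration only (values worked out from the definition above):

     aarch64_uimm12_shift (0xabc)     -> true   (12-bit field, shift 0)
     aarch64_uimm12_shift (0xabc000)  -> true   (12-bit field, shift 12)
     aarch64_uimm12_shift (0xabc00)   -> false  (straddles both fields)  */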
3988 /* Return true if val is an immediate that can be loaded into a
3989 register by a MOVZ instruction. */
3990 static bool
3991 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3993 if (GET_MODE_SIZE (mode) > 4)
3995 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3996 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3997 return 1;
3999 else
4001 /* Ignore sign extension. */
4002 val &= (HOST_WIDE_INT) 0xffffffff;
4004 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4005 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
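/* For illustration only (values derived from the checks above): in DImode,
   0x1234, 0x12340000, 0x123400000000 and 0x1234000000000000 are each a
   single MOVZ (one 16-bit chunk at a halfword boundary), while 0x12345678
   would also need a MOVK and is rejected here.  */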
4008 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4010 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4012 0x0000000100000001ull,
4013 0x0001000100010001ull,
4014 0x0101010101010101ull,
4015 0x1111111111111111ull,
4016 0x5555555555555555ull,
4020 /* Return true if val is a valid bitmask immediate. */
4022 bool
4023 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4025 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4026 int bits;
4028 /* Check for a single sequence of one bits and return quickly if so.
4029 The special cases of all ones and all zeroes return false. */
4030 val = (unsigned HOST_WIDE_INT) val_in;
4031 tmp = val + (val & -val);
4033 if (tmp == (tmp & -tmp))
4034 return (val + 1) > 1;
4036 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4037 if (mode == SImode)
4038 val = (val << 32) | (val & 0xffffffff);
4040 /* Invert if the immediate doesn't start with a zero bit - this means we
4041 only need to search for sequences of one bits. */
4042 if (val & 1)
4043 val = ~val;
4045 /* Find the first set bit and set tmp to val with the first sequence of one
4046 bits removed. Return success if there is a single sequence of ones. */
4047 first_one = val & -val;
4048 tmp = val & (val + first_one);
4050 if (tmp == 0)
4051 return true;
4053 /* Find the next set bit and compute the difference in bit position. */
4054 next_one = tmp & -tmp;
4055 bits = clz_hwi (first_one) - clz_hwi (next_one);
4056 mask = val ^ tmp;
4058 /* Check that the bit position difference is a power of 2 and that the
4059 first sequence of one bits fits within 'bits' bits. */
4060 if ((mask >> bits) != 0 || bits != (bits & -bits))
4061 return false;
4063 /* Check the sequence of one bits is repeated 64/bits times. */
4064 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
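/* A hypothetical checking helper, not part of this file's interface, that
   makes a few accepted and rejected bitmask immediates concrete.  It is
   illustrative only and never called; a real build would need a caller or
   an unused attribute.  */
static void
aarch64_bitmask_imm_examples (void)
{
  /* A run of 8 ones repeated every 16 bits is encodable.  */
  gcc_assert (aarch64_bitmask_imm (0x00ff00ff00ff00ffull, DImode));
  /* Alternating bits form a repeating 2-bit pattern.  */
  gcc_assert (aarch64_bitmask_imm (0x5555555555555555ull, DImode));
  /* All zeros and all ones are explicitly rejected.  */
  gcc_assert (!aarch64_bitmask_imm (0, DImode));
  gcc_assert (!aarch64_bitmask_imm (-1, DImode));
  /* Two separate runs with no valid repeat period are rejected.  */
  gcc_assert (!aarch64_bitmask_imm (0x5, DImode));
}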
4067 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4068 Assumed precondition: VAL_IN is not zero. */
4070 unsigned HOST_WIDE_INT
4071 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4073 int lowest_bit_set = ctz_hwi (val_in);
4074 int highest_bit_set = floor_log2 (val_in);
4075 gcc_assert (val_in != 0);
4077 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4078 (HOST_WIDE_INT_1U << lowest_bit_set));
4081 /* Create a constant in which the bits outside the range from the lowest
4082 set bit to the highest set bit of VAL_IN are set to 1. */
4084 unsigned HOST_WIDE_INT
4085 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4087 return val_in | ~aarch64_and_split_imm1 (val_in);
4090 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4092 bool
4093 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4095 if (aarch64_bitmask_imm (val_in, mode))
4096 return false;
4098 if (aarch64_move_imm (val_in, mode))
4099 return false;
4101 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4103 return aarch64_bitmask_imm (imm2, mode);
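/* A hypothetical worked example: the DImode constant 0x00ff00ff is neither
   a bitmask nor a MOV immediate, but it is the intersection of two bitmask
   immediates, so an AND with it can be split into two ANDs.  Illustrative
   only and never called.  */
static void
aarch64_and_split_example (void)
{
  unsigned HOST_WIDE_INT val = 0x00ff00ff;
  gcc_assert (aarch64_and_bitmask_imm (val, DImode));
  /* Ones covering the lowest to the highest set bit of VAL.  */
  gcc_assert (aarch64_and_split_imm1 (val) == 0xffffff);
  /* Everything outside that range, plus the original bits.  */
  gcc_assert (aarch64_and_split_imm2 (val) == ~HOST_WIDE_INT_UC (0xff00));
  /* AND-ing with both masks in turn reproduces the original constant.  */
  gcc_assert ((aarch64_and_split_imm1 (val)
	       & aarch64_and_split_imm2 (val)) == val);
}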
4106 /* Return true if val is an immediate that can be loaded into a
4107 register in a single instruction. */
4108 bool
4109 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4111 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4112 return 1;
4113 return aarch64_bitmask_imm (val, mode);
4116 static bool
4117 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4119 rtx base, offset;
4121 if (GET_CODE (x) == HIGH)
4122 return true;
4124 split_const (x, &base, &offset);
4125 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4127 if (aarch64_classify_symbol (base, offset)
4128 != SYMBOL_FORCE_TO_MEM)
4129 return true;
4130 else
4131 /* Avoid generating a 64-bit relocation in ILP32; leave
4132 to aarch64_expand_mov_immediate to handle it properly. */
4133 return mode != ptr_mode;
4136 return aarch64_tls_referenced_p (x);
4139 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4140 The expansion for a table switch is quite expensive due to the number
4141 of instructions, the table lookup and the hard-to-predict indirect jump.
4142 When optimizing for speed at -O3 and above, use the per-core tuning if
4143 set; otherwise use tables for more than 16 cases as a tradeoff between
4144 size and performance. When optimizing for size, use the default setting. */
4146 static unsigned int
4147 aarch64_case_values_threshold (void)
4149 /* Use the specified limit for the number of cases before using jump
4150 tables at higher optimization levels. */
4151 if (optimize > 2
4152 && selected_cpu->tune->max_case_values != 0)
4153 return selected_cpu->tune->max_case_values;
4154 else
4155 return optimize_size ? default_case_values_threshold () : 17;
4158 /* Return true if register REGNO is a valid index register.
4159 STRICT_P is true if REG_OK_STRICT is in effect. */
4161 bool
4162 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4164 if (!HARD_REGISTER_NUM_P (regno))
4166 if (!strict_p)
4167 return true;
4169 if (!reg_renumber)
4170 return false;
4172 regno = reg_renumber[regno];
4174 return GP_REGNUM_P (regno);
4177 /* Return true if register REGNO is a valid base register for mode MODE.
4178 STRICT_P is true if REG_OK_STRICT is in effect. */
4180 bool
4181 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4183 if (!HARD_REGISTER_NUM_P (regno))
4185 if (!strict_p)
4186 return true;
4188 if (!reg_renumber)
4189 return false;
4191 regno = reg_renumber[regno];
4194 /* The fake registers will be eliminated to either the stack or
4195 hard frame pointer, both of which are usually valid base registers.
4196 Reload deals with the cases where the eliminated form isn't valid. */
4197 return (GP_REGNUM_P (regno)
4198 || regno == SP_REGNUM
4199 || regno == FRAME_POINTER_REGNUM
4200 || regno == ARG_POINTER_REGNUM);
4203 /* Return true if X is a valid base register for mode MODE.
4204 STRICT_P is true if REG_OK_STRICT is in effect. */
4206 static bool
4207 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4209 if (!strict_p
4210 && GET_CODE (x) == SUBREG
4211 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4212 x = SUBREG_REG (x);
4214 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4217 /* Return true if address offset is a valid index. If it is, fill in INFO
4218 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4220 static bool
4221 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4222 machine_mode mode, bool strict_p)
4224 enum aarch64_address_type type;
4225 rtx index;
4226 int shift;
4228 /* (reg:P) */
4229 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4230 && GET_MODE (x) == Pmode)
4232 type = ADDRESS_REG_REG;
4233 index = x;
4234 shift = 0;
4236 /* (sign_extend:DI (reg:SI)) */
4237 else if ((GET_CODE (x) == SIGN_EXTEND
4238 || GET_CODE (x) == ZERO_EXTEND)
4239 && GET_MODE (x) == DImode
4240 && GET_MODE (XEXP (x, 0)) == SImode)
4242 type = (GET_CODE (x) == SIGN_EXTEND)
4243 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4244 index = XEXP (x, 0);
4245 shift = 0;
4247 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4248 else if (GET_CODE (x) == MULT
4249 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4250 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4251 && GET_MODE (XEXP (x, 0)) == DImode
4252 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4253 && CONST_INT_P (XEXP (x, 1)))
4255 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4256 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4257 index = XEXP (XEXP (x, 0), 0);
4258 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4260 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4261 else if (GET_CODE (x) == ASHIFT
4262 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4263 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4264 && GET_MODE (XEXP (x, 0)) == DImode
4265 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4266 && CONST_INT_P (XEXP (x, 1)))
4268 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4269 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4270 index = XEXP (XEXP (x, 0), 0);
4271 shift = INTVAL (XEXP (x, 1));
4273 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4274 else if ((GET_CODE (x) == SIGN_EXTRACT
4275 || GET_CODE (x) == ZERO_EXTRACT)
4276 && GET_MODE (x) == DImode
4277 && GET_CODE (XEXP (x, 0)) == MULT
4278 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4279 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4281 type = (GET_CODE (x) == SIGN_EXTRACT)
4282 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4283 index = XEXP (XEXP (x, 0), 0);
4284 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4285 if (INTVAL (XEXP (x, 1)) != 32 + shift
4286 || INTVAL (XEXP (x, 2)) != 0)
4287 shift = -1;
4289 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4290 (const_int 0xffffffff<<shift)) */
4291 else if (GET_CODE (x) == AND
4292 && GET_MODE (x) == DImode
4293 && GET_CODE (XEXP (x, 0)) == MULT
4294 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4295 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4296 && CONST_INT_P (XEXP (x, 1)))
4298 type = ADDRESS_REG_UXTW;
4299 index = XEXP (XEXP (x, 0), 0);
4300 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4301 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4302 shift = -1;
4304 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4305 else if ((GET_CODE (x) == SIGN_EXTRACT
4306 || GET_CODE (x) == ZERO_EXTRACT)
4307 && GET_MODE (x) == DImode
4308 && GET_CODE (XEXP (x, 0)) == ASHIFT
4309 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4310 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4312 type = (GET_CODE (x) == SIGN_EXTRACT)
4313 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4314 index = XEXP (XEXP (x, 0), 0);
4315 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4316 if (INTVAL (XEXP (x, 1)) != 32 + shift
4317 || INTVAL (XEXP (x, 2)) != 0)
4318 shift = -1;
4320 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4321 (const_int 0xffffffff<<shift)) */
4322 else if (GET_CODE (x) == AND
4323 && GET_MODE (x) == DImode
4324 && GET_CODE (XEXP (x, 0)) == ASHIFT
4325 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4326 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4327 && CONST_INT_P (XEXP (x, 1)))
4329 type = ADDRESS_REG_UXTW;
4330 index = XEXP (XEXP (x, 0), 0);
4331 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4332 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4333 shift = -1;
4335 /* (mult:P (reg:P) (const_int scale)) */
4336 else if (GET_CODE (x) == MULT
4337 && GET_MODE (x) == Pmode
4338 && GET_MODE (XEXP (x, 0)) == Pmode
4339 && CONST_INT_P (XEXP (x, 1)))
4341 type = ADDRESS_REG_REG;
4342 index = XEXP (x, 0);
4343 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4345 /* (ashift:P (reg:P) (const_int shift)) */
4346 else if (GET_CODE (x) == ASHIFT
4347 && GET_MODE (x) == Pmode
4348 && GET_MODE (XEXP (x, 0)) == Pmode
4349 && CONST_INT_P (XEXP (x, 1)))
4351 type = ADDRESS_REG_REG;
4352 index = XEXP (x, 0);
4353 shift = INTVAL (XEXP (x, 1));
4355 else
4356 return false;
4358 if (!strict_p
4359 && GET_CODE (index) == SUBREG
4360 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4361 index = SUBREG_REG (index);
4363 if ((shift == 0 ||
4364 (shift > 0 && shift <= 3
4365 && (1 << shift) == GET_MODE_SIZE (mode)))
4366 && REG_P (index)
4367 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4369 info->type = type;
4370 info->offset = index;
4371 info->shift = shift;
4372 return true;
4375 return false;
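/* A worked example for illustration: for an SImode access, the index part
   of an address such as

     (plus:DI (reg:DI x0)
	      (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4)))

   is classified as ADDRESS_REG_SXTW with shift 2 and is later printed as
   [x0, w1, sxtw 2]; the same index scaled by 16 would be rejected, since
   the shift must match the access size.  */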
4378 /* Return true if MODE is one of the modes for which we
4379 support LDP/STP operations. */
4381 static bool
4382 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4384 return mode == SImode || mode == DImode
4385 || mode == SFmode || mode == DFmode
4386 || (aarch64_vector_mode_supported_p (mode)
4387 && GET_MODE_SIZE (mode) == 8);
4390 /* Return true if REGNO is a virtual pointer register, or an eliminable
4391 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4392 include stack_pointer or hard_frame_pointer. */
4393 static bool
4394 virt_or_elim_regno_p (unsigned regno)
4396 return ((regno >= FIRST_VIRTUAL_REGISTER
4397 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4398 || regno == FRAME_POINTER_REGNUM
4399 || regno == ARG_POINTER_REGNUM);
4402 /* Return true if X is a valid address for machine mode MODE. If it is,
4403 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4404 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4406 static bool
4407 aarch64_classify_address (struct aarch64_address_info *info,
4408 rtx x, machine_mode mode,
4409 RTX_CODE outer_code, bool strict_p)
4411 enum rtx_code code = GET_CODE (x);
4412 rtx op0, op1;
4414 /* On BE, we use load/store pair for all large int mode load/stores.
4415 TI/TFmode may also use a load/store pair. */
4416 bool load_store_pair_p = (outer_code == PARALLEL
4417 || mode == TImode
4418 || mode == TFmode
4419 || (BYTES_BIG_ENDIAN
4420 && aarch64_vect_struct_mode_p (mode)));
4422 bool allow_reg_index_p =
4423 !load_store_pair_p
4424 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4425 && !aarch64_vect_struct_mode_p (mode);
4427 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4428 REG addressing. */
4429 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4430 && (code != POST_INC && code != REG))
4431 return false;
4433 switch (code)
4435 case REG:
4436 case SUBREG:
4437 info->type = ADDRESS_REG_IMM;
4438 info->base = x;
4439 info->offset = const0_rtx;
4440 return aarch64_base_register_rtx_p (x, strict_p);
4442 case PLUS:
4443 op0 = XEXP (x, 0);
4444 op1 = XEXP (x, 1);
4446 if (! strict_p
4447 && REG_P (op0)
4448 && virt_or_elim_regno_p (REGNO (op0))
4449 && CONST_INT_P (op1))
4451 info->type = ADDRESS_REG_IMM;
4452 info->base = op0;
4453 info->offset = op1;
4455 return true;
4458 if (GET_MODE_SIZE (mode) != 0
4459 && CONST_INT_P (op1)
4460 && aarch64_base_register_rtx_p (op0, strict_p))
4462 HOST_WIDE_INT offset = INTVAL (op1);
4464 info->type = ADDRESS_REG_IMM;
4465 info->base = op0;
4466 info->offset = op1;
4468 /* TImode and TFmode values are allowed in both pairs of X
4469 registers and individual Q registers. The available
4470 address modes are:
4471 X,X: 7-bit signed scaled offset
4472 Q: 9-bit signed offset
4473 We conservatively require an offset representable in either mode.
4474 When performing the check for pairs of X registers i.e. LDP/STP
4475 pass down DImode since that is the natural size of the LDP/STP
4476 instruction memory accesses. */
4477 if (mode == TImode || mode == TFmode)
4478 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4479 && (offset_9bit_signed_unscaled_p (mode, offset)
4480 || offset_12bit_unsigned_scaled_p (mode, offset)));
4482 /* A 7-bit offset check because OImode will emit an ldp/stp
4483 instruction (only big endian will get here).
4484 For ldp/stp instructions, the offset is scaled for the size of a
4485 single element of the pair. */
4486 if (mode == OImode)
4487 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4489 /* Three 9/12-bit offset checks because CImode will emit three
4490 ldr/str instructions (only big endian will get here). */
4491 if (mode == CImode)
4492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4493 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4494 || offset_12bit_unsigned_scaled_p (V16QImode,
4495 offset + 32)));
4497 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4498 instructions (only big endian will get here). */
4499 if (mode == XImode)
4500 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4501 && aarch64_offset_7bit_signed_scaled_p (TImode,
4502 offset + 32));
4504 if (load_store_pair_p)
4505 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4506 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4507 else
4508 return (offset_9bit_signed_unscaled_p (mode, offset)
4509 || offset_12bit_unsigned_scaled_p (mode, offset));
4512 if (allow_reg_index_p)
4514 /* Look for base + (scaled/extended) index register. */
4515 if (aarch64_base_register_rtx_p (op0, strict_p)
4516 && aarch64_classify_index (info, op1, mode, strict_p))
4518 info->base = op0;
4519 return true;
4521 if (aarch64_base_register_rtx_p (op1, strict_p)
4522 && aarch64_classify_index (info, op0, mode, strict_p))
4524 info->base = op1;
4525 return true;
4529 return false;
4531 case POST_INC:
4532 case POST_DEC:
4533 case PRE_INC:
4534 case PRE_DEC:
4535 info->type = ADDRESS_REG_WB;
4536 info->base = XEXP (x, 0);
4537 info->offset = NULL_RTX;
4538 return aarch64_base_register_rtx_p (info->base, strict_p);
4540 case POST_MODIFY:
4541 case PRE_MODIFY:
4542 info->type = ADDRESS_REG_WB;
4543 info->base = XEXP (x, 0);
4544 if (GET_CODE (XEXP (x, 1)) == PLUS
4545 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4546 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4547 && aarch64_base_register_rtx_p (info->base, strict_p))
4549 HOST_WIDE_INT offset;
4550 info->offset = XEXP (XEXP (x, 1), 1);
4551 offset = INTVAL (info->offset);
4553 /* TImode and TFmode values are allowed in both pairs of X
4554 registers and individual Q registers. The available
4555 address modes are:
4556 X,X: 7-bit signed scaled offset
4557 Q: 9-bit signed offset
4558 We conservatively require an offset representable in either mode.
4560 if (mode == TImode || mode == TFmode)
4561 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4562 && offset_9bit_signed_unscaled_p (mode, offset));
4564 if (load_store_pair_p)
4565 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4566 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4567 else
4568 return offset_9bit_signed_unscaled_p (mode, offset);
4570 return false;
4572 case CONST:
4573 case SYMBOL_REF:
4574 case LABEL_REF:
4575 /* load literal: pc-relative constant pool entry. Only supported
4576 for SI mode or larger. */
4577 info->type = ADDRESS_SYMBOLIC;
4579 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4581 rtx sym, addend;
4583 split_const (x, &sym, &addend);
4584 return ((GET_CODE (sym) == LABEL_REF
4585 || (GET_CODE (sym) == SYMBOL_REF
4586 && CONSTANT_POOL_ADDRESS_P (sym)
4587 && aarch64_pcrelative_literal_loads)));
4589 return false;
4591 case LO_SUM:
4592 info->type = ADDRESS_LO_SUM;
4593 info->base = XEXP (x, 0);
4594 info->offset = XEXP (x, 1);
4595 if (allow_reg_index_p
4596 && aarch64_base_register_rtx_p (info->base, strict_p))
4598 rtx sym, offs;
4599 split_const (info->offset, &sym, &offs);
4600 if (GET_CODE (sym) == SYMBOL_REF
4601 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4603 /* The symbol and offset must be aligned to the access size. */
4604 unsigned int align;
4605 unsigned int ref_size;
4607 if (CONSTANT_POOL_ADDRESS_P (sym))
4608 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4609 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4611 tree exp = SYMBOL_REF_DECL (sym);
4612 align = TYPE_ALIGN (TREE_TYPE (exp));
4613 align = CONSTANT_ALIGNMENT (exp, align);
4615 else if (SYMBOL_REF_DECL (sym))
4616 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4617 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4618 && SYMBOL_REF_BLOCK (sym) != NULL)
4619 align = SYMBOL_REF_BLOCK (sym)->alignment;
4620 else
4621 align = BITS_PER_UNIT;
4623 ref_size = GET_MODE_SIZE (mode);
4624 if (ref_size == 0)
4625 ref_size = GET_MODE_SIZE (DImode);
4627 return ((INTVAL (offs) & (ref_size - 1)) == 0
4628 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4631 return false;
4633 default:
4634 return false;
4638 /* Return true if the address X is valid for a PRFM instruction.
4639 STRICT_P is true if we should do strict checking with
4640 aarch64_classify_address. */
4642 bool
4643 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4645 struct aarch64_address_info addr;
4647 /* PRFM accepts the same addresses as DImode... */
4648 bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4649 if (!res)
4650 return false;
4652 /* ... except writeback forms. */
4653 return addr.type != ADDRESS_REG_WB;
4656 bool
4657 aarch64_symbolic_address_p (rtx x)
4659 rtx offset;
4661 split_const (x, &x, &offset);
4662 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4665 /* Classify the base of symbolic expression X. */
4667 enum aarch64_symbol_type
4668 aarch64_classify_symbolic_expression (rtx x)
4670 rtx offset;
4672 split_const (x, &x, &offset);
4673 return aarch64_classify_symbol (x, offset);
4677 /* Return TRUE if X is a legitimate address for accessing memory in
4678 mode MODE. */
4679 static bool
4680 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4682 struct aarch64_address_info addr;
4684 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4687 /* Return TRUE if X is a legitimate address for accessing memory in
4688 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4689 pair operation. */
4690 bool
4691 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4692 RTX_CODE outer_code, bool strict_p)
4694 struct aarch64_address_info addr;
4696 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4699 /* Split an out-of-range address displacement into a base and offset.
4700 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4701 to increase opportunities for sharing the base address between accesses of different sizes.
4702 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4703 static bool
4704 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4706 HOST_WIDE_INT offset = INTVAL (*disp);
4707 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4709 if (mode == TImode || mode == TFmode
4710 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4711 base = (offset + 0x100) & ~0x1ff;
4713 *off = GEN_INT (base);
4714 *disp = GEN_INT (offset - base);
4715 return true;
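/* Worked examples derived from the code above: an aligned SImode access at
   offset 0x4010 splits into an anchor of 0x4000 plus a residual 0x10
   (16KB mask 0x3ffc); a QImode access at 0x1234 uses the 4KB mask and
   splits into 0x1000 + 0x234; a TImode access at 0x2345 falls back to the
   signed 9-bit range and splits into 0x2400 - 0xbb.  */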
4718 /* Return the binary representation of floating point constant VALUE in INTVAL.
4719 If the value cannot be converted, return false without setting INTVAL.
4720 The conversion is done in the given MODE. */
4721 bool
4722 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4725 /* We make a general exception for 0. */
4726 if (aarch64_float_const_zero_rtx_p (value))
4728 *intval = 0;
4729 return true;
4732 machine_mode mode = GET_MODE (value);
4733 if (GET_CODE (value) != CONST_DOUBLE
4734 || !SCALAR_FLOAT_MODE_P (mode)
4735 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4736 /* Only support up to DF mode. */
4737 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4738 return false;
4740 unsigned HOST_WIDE_INT ival = 0;
4742 long res[2];
4743 real_to_target (res,
4744 CONST_DOUBLE_REAL_VALUE (value),
4745 REAL_MODE_FORMAT (mode));
4747 if (mode == DFmode)
4749 int order = BYTES_BIG_ENDIAN ? 1 : 0;
4750 ival = zext_hwi (res[order], 32);
4751 ival |= (zext_hwi (res[1 - order], 32) << 32);
4753 else
4754 ival = zext_hwi (res[0], 32);
4756 *intval = ival;
4757 return true;
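/* For example (standard IEEE-754 encodings): 1.0 in DFmode yields
   0x3ff0000000000000, 1.0 in SFmode yields 0x3f800000 and 1.0 in HFmode
   yields 0x3c00, each returned zero-extended in *INTVAL.  */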
4760 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4761 single MOV(+MOVK) followed by an FMOV. */
4762 bool
4763 aarch64_float_const_rtx_p (rtx x)
4765 machine_mode mode = GET_MODE (x);
4766 if (mode == VOIDmode)
4767 return false;
4769 /* Determine whether it's cheaper to write float constants as
4770 mov/movk pairs rather than as ldr/adrp pairs. */
4771 unsigned HOST_WIDE_INT ival;
4773 if (GET_CODE (x) == CONST_DOUBLE
4774 && SCALAR_FLOAT_MODE_P (mode)
4775 && aarch64_reinterpret_float_as_int (x, &ival))
4777 machine_mode imode = (mode == HFmode
4778 ? SImode
4779 : int_mode_for_mode (mode).require ());
4780 int num_instr = aarch64_internal_mov_immediate
4781 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4782 return num_instr < 3;
4785 return false;
4788 /* Return TRUE if rtx X is the immediate constant 0.0. */
4789 bool
4790 aarch64_float_const_zero_rtx_p (rtx x)
4792 if (GET_MODE (x) == VOIDmode)
4793 return false;
4795 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4796 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4797 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4800 /* Return TRUE if rtx X is an immediate constant that fits in a single
4801 MOVI immediate operation. */
4802 bool
4803 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4805 if (!TARGET_SIMD)
4806 return false;
4808 machine_mode vmode, imode;
4809 unsigned HOST_WIDE_INT ival;
4811 if (GET_CODE (x) == CONST_DOUBLE
4812 && SCALAR_FLOAT_MODE_P (mode))
4814 if (!aarch64_reinterpret_float_as_int (x, &ival))
4815 return false;
4817 /* We make a general exception for 0. */
4818 if (aarch64_float_const_zero_rtx_p (x))
4819 return true;
4821 imode = int_mode_for_mode (mode).require ();
4823 else if (GET_CODE (x) == CONST_INT
4824 && SCALAR_INT_MODE_P (mode))
4826 imode = mode;
4827 ival = INTVAL (x);
4829 else
4830 return false;
4832 /* Use a 64-bit mode for everything except DI/DF mode, where we use
4833 a 128-bit vector mode. */
4834 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4836 vmode = aarch64_simd_container_mode (imode, width);
4837 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4839 return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4843 /* Return the fixed registers used for condition codes. */
4845 static bool
4846 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4848 *p1 = CC_REGNUM;
4849 *p2 = INVALID_REGNUM;
4850 return true;
4853 /* This function is used by the call expanders of the machine description.
4854 RESULT is the register in which the result is returned. It's NULL for
4855 "call" and "sibcall".
4856 MEM is the location of the function call.
4857 SIBCALL indicates whether this function call is a normal call or a sibling call.
4858 It will generate a different pattern accordingly. */
4860 void
4861 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4863 rtx call, callee, tmp;
4864 rtvec vec;
4865 machine_mode mode;
4867 gcc_assert (MEM_P (mem));
4868 callee = XEXP (mem, 0);
4869 mode = GET_MODE (callee);
4870 gcc_assert (mode == Pmode);
4872 /* Decide if we should generate indirect calls by loading the
4873 address of the callee into a register before performing
4874 the branch-and-link. */
4875 if (SYMBOL_REF_P (callee)
4876 ? (aarch64_is_long_call_p (callee)
4877 || aarch64_is_noplt_call_p (callee))
4878 : !REG_P (callee))
4879 XEXP (mem, 0) = force_reg (mode, callee);
4881 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4883 if (result != NULL_RTX)
4884 call = gen_rtx_SET (result, call);
4886 if (sibcall)
4887 tmp = ret_rtx;
4888 else
4889 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4891 vec = gen_rtvec (2, call, tmp);
4892 call = gen_rtx_PARALLEL (VOIDmode, vec);
4894 aarch64_emit_call_insn (call);
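/* As an illustration derived from the code above, a plain call to a
   hypothetical nearby function foo ends up roughly as

     (parallel [(call (mem:DI (symbol_ref:DI ("foo"))) (const_int 0))
		(clobber (reg:DI LR_REGNUM))])

   while a sibcall replaces the clobber of the link register with
   (return), and a value-returning call wraps the call rtx in a set of
   the result register.  */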
4897 /* Emit call insn with PAT and do aarch64-specific handling. */
4899 void
4900 aarch64_emit_call_insn (rtx pat)
4902 rtx insn = emit_call_insn (pat);
4904 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4905 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4906 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4909 machine_mode
4910 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4912 /* All floating point compares return CCFP if it is an equality
4913 comparison, and CCFPE otherwise. */
4914 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4916 switch (code)
4918 case EQ:
4919 case NE:
4920 case UNORDERED:
4921 case ORDERED:
4922 case UNLT:
4923 case UNLE:
4924 case UNGT:
4925 case UNGE:
4926 case UNEQ:
4927 case LTGT:
4928 return CCFPmode;
4930 case LT:
4931 case LE:
4932 case GT:
4933 case GE:
4934 return CCFPEmode;
4936 default:
4937 gcc_unreachable ();
4941 /* Equality comparisons of short modes against zero can be performed
4942 using the TST instruction with the appropriate bitmask. */
4943 if (y == const0_rtx && REG_P (x)
4944 && (code == EQ || code == NE)
4945 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4946 return CC_NZmode;
4948 /* Similarly, comparisons of zero_extends from shorter modes can
4949 be performed using an ANDS with an immediate mask. */
4950 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4951 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4952 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4953 && (code == EQ || code == NE))
4954 return CC_NZmode;
4956 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4957 && y == const0_rtx
4958 && (code == EQ || code == NE || code == LT || code == GE)
4959 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4960 || GET_CODE (x) == NEG
4961 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4962 && CONST_INT_P (XEXP (x, 2)))))
4963 return CC_NZmode;
4965 /* A compare with a shifted operand. Because of canonicalization,
4966 the comparison will have to be swapped when we emit the assembly
4967 code. */
4968 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4969 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4970 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4971 || GET_CODE (x) == LSHIFTRT
4972 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4973 return CC_SWPmode;
4975 /* Similarly for a negated operand, but we can only do this for
4976 equalities. */
4977 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4978 && (REG_P (y) || GET_CODE (y) == SUBREG)
4979 && (code == EQ || code == NE)
4980 && GET_CODE (x) == NEG)
4981 return CC_Zmode;
4983 /* A test for unsigned overflow. */
4984 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4985 && code == NE
4986 && GET_CODE (x) == PLUS
4987 && GET_CODE (y) == ZERO_EXTEND)
4988 return CC_Cmode;
4990 /* For everything else, return CCmode. */
4991 return CCmode;
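/* A hypothetical illustration of how the classification above behaves for
   two common shapes; it is never called, and a real build would need a
   caller or an unused attribute.  */
static void
aarch64_select_cc_mode_examples (void)
{
  rtx w0 = gen_rtx_REG (SImode, R0_REGNUM);
  rtx w1 = gen_rtx_REG (SImode, R0_REGNUM + 1);
  /* Comparing the result of an addition against zero can reuse the
     flag-setting form of the add, so CC_NZmode is chosen.  */
  gcc_assert (aarch64_select_cc_mode (NE, gen_rtx_PLUS (SImode, w0, w1),
				      const0_rtx) == CC_NZmode);
  /* A shifted first operand means the operands will be swapped when the
     comparison is output, hence CC_SWPmode.  */
  gcc_assert (aarch64_select_cc_mode (GT,
				      gen_rtx_ASHIFT (SImode, w0, GEN_INT (2)),
				      w1) == CC_SWPmode);
}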
4994 static int
4995 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4998 aarch64_get_condition_code (rtx x)
5000 machine_mode mode = GET_MODE (XEXP (x, 0));
5001 enum rtx_code comp_code = GET_CODE (x);
5003 if (GET_MODE_CLASS (mode) != MODE_CC)
5004 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5005 return aarch64_get_condition_code_1 (mode, comp_code);
5008 static int
5009 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5011 switch (mode)
5013 case E_CCFPmode:
5014 case E_CCFPEmode:
5015 switch (comp_code)
5017 case GE: return AARCH64_GE;
5018 case GT: return AARCH64_GT;
5019 case LE: return AARCH64_LS;
5020 case LT: return AARCH64_MI;
5021 case NE: return AARCH64_NE;
5022 case EQ: return AARCH64_EQ;
5023 case ORDERED: return AARCH64_VC;
5024 case UNORDERED: return AARCH64_VS;
5025 case UNLT: return AARCH64_LT;
5026 case UNLE: return AARCH64_LE;
5027 case UNGT: return AARCH64_HI;
5028 case UNGE: return AARCH64_PL;
5029 default: return -1;
5031 break;
5033 case E_CCmode:
5034 switch (comp_code)
5036 case NE: return AARCH64_NE;
5037 case EQ: return AARCH64_EQ;
5038 case GE: return AARCH64_GE;
5039 case GT: return AARCH64_GT;
5040 case LE: return AARCH64_LE;
5041 case LT: return AARCH64_LT;
5042 case GEU: return AARCH64_CS;
5043 case GTU: return AARCH64_HI;
5044 case LEU: return AARCH64_LS;
5045 case LTU: return AARCH64_CC;
5046 default: return -1;
5048 break;
5050 case E_CC_SWPmode:
5051 switch (comp_code)
5053 case NE: return AARCH64_NE;
5054 case EQ: return AARCH64_EQ;
5055 case GE: return AARCH64_LE;
5056 case GT: return AARCH64_LT;
5057 case LE: return AARCH64_GE;
5058 case LT: return AARCH64_GT;
5059 case GEU: return AARCH64_LS;
5060 case GTU: return AARCH64_CC;
5061 case LEU: return AARCH64_CS;
5062 case LTU: return AARCH64_HI;
5063 default: return -1;
5065 break;
5067 case E_CC_NZmode:
5068 switch (comp_code)
5070 case NE: return AARCH64_NE;
5071 case EQ: return AARCH64_EQ;
5072 case GE: return AARCH64_PL;
5073 case LT: return AARCH64_MI;
5074 default: return -1;
5076 break;
5078 case E_CC_Zmode:
5079 switch (comp_code)
5081 case NE: return AARCH64_NE;
5082 case EQ: return AARCH64_EQ;
5083 default: return -1;
5085 break;
5087 case E_CC_Cmode:
5088 switch (comp_code)
5090 case NE: return AARCH64_CS;
5091 case EQ: return AARCH64_CC;
5092 default: return -1;
5094 break;
5096 default:
5097 return -1;
5100 return -1;
5103 bool
5104 aarch64_const_vec_all_same_in_range_p (rtx x,
5105 HOST_WIDE_INT minval,
5106 HOST_WIDE_INT maxval)
5108 HOST_WIDE_INT firstval;
5109 int count, i;
5111 if (GET_CODE (x) != CONST_VECTOR
5112 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5113 return false;
5115 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5116 if (firstval < minval || firstval > maxval)
5117 return false;
5119 count = CONST_VECTOR_NUNITS (x);
5120 for (i = 1; i < count; i++)
5121 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5122 return false;
5124 return true;
5127 bool
5128 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5130 return aarch64_const_vec_all_same_in_range_p (x, val, val);
5134 /* N Z C V. */
5135 #define AARCH64_CC_V 1
5136 #define AARCH64_CC_C (1 << 1)
5137 #define AARCH64_CC_Z (1 << 2)
5138 #define AARCH64_CC_N (1 << 3)
5140 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5141 static const int aarch64_nzcv_codes[] =
5143 0, /* EQ, Z == 1. */
5144 AARCH64_CC_Z, /* NE, Z == 0. */
5145 0, /* CS, C == 1. */
5146 AARCH64_CC_C, /* CC, C == 0. */
5147 0, /* MI, N == 1. */
5148 AARCH64_CC_N, /* PL, N == 0. */
5149 0, /* VS, V == 1. */
5150 AARCH64_CC_V, /* VC, V == 0. */
5151 0, /* HI, C == 1 && Z == 0. */
5152 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
5153 AARCH64_CC_V, /* GE, N == V. */
5154 0, /* LT, N != V. */
5155 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
5156 0, /* LE, !(Z == 0 && N == V). */
5157 0, /* AL, Any. */
5158 0 /* NV, Any. */
5161 /* Print operand X to file F in a target specific manner according to CODE.
5162 The acceptable formatting commands given by CODE are:
5163 'c': An integer or symbol address without a preceding #
5164 sign.
5165 'e': Print the sign/zero-extend size as a character 8->b,
5166 16->h, 32->w.
5167 'p': Prints N such that 2^N == X (X must be power of 2 and
5168 const int).
5169 'P': Print the number of non-zero bits in X (a const_int).
5170 'H': Print the higher numbered register of a pair (TImode)
5171 of regs.
5172 'm': Print a condition (eq, ne, etc).
5173 'M': Same as 'm', but invert condition.
5174 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5175 'S/T/U/V': Print a FP/SIMD register name for a register list.
5176 The register printed is the FP/SIMD register name
5177 of X + 0/1/2/3 for S/T/U/V.
5178 'R': Print a scalar FP/SIMD register name + 1.
5179 'X': Print bottom 16 bits of integer constant in hex.
5180 'w/x': Print a general register name or the zero register
5181 (32-bit or 64-bit).
5182 '0': Print a normal operand; if it's a general register,
5183 then we assume DImode.
5184 'k': Print NZCV for conditional compare instructions.
5185 'A': Output address constant representing the first
5186 argument of X, specifying a relocation offset
5187 if appropriate.
5188 'L': Output constant address specified by X
5189 with a relocation offset if appropriate.
5190 'G': Prints address of X, specifying a PC relative
5191 relocation mode if appropriate. */
5193 static void
5194 aarch64_print_operand (FILE *f, rtx x, int code)
5196 switch (code)
5198 case 'c':
5199 switch (GET_CODE (x))
5201 case CONST_INT:
5202 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5203 break;
5205 case SYMBOL_REF:
5206 output_addr_const (f, x);
5207 break;
5209 case CONST:
5210 if (GET_CODE (XEXP (x, 0)) == PLUS
5211 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5213 output_addr_const (f, x);
5214 break;
5216 /* Fall through. */
5218 default:
5219 output_operand_lossage ("Unsupported operand for code '%c'", code);
5221 break;
5223 case 'e':
5225 int n;
5227 if (!CONST_INT_P (x)
5228 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5230 output_operand_lossage ("invalid operand for '%%%c'", code);
5231 return;
5234 switch (n)
5236 case 3:
5237 fputc ('b', f);
5238 break;
5239 case 4:
5240 fputc ('h', f);
5241 break;
5242 case 5:
5243 fputc ('w', f);
5244 break;
5245 default:
5246 output_operand_lossage ("invalid operand for '%%%c'", code);
5247 return;
5250 break;
5252 case 'p':
5254 int n;
5256 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5258 output_operand_lossage ("invalid operand for '%%%c'", code);
5259 return;
5262 asm_fprintf (f, "%d", n);
5264 break;
5266 case 'P':
5267 if (!CONST_INT_P (x))
5269 output_operand_lossage ("invalid operand for '%%%c'", code);
5270 return;
5273 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5274 break;
5276 case 'H':
5277 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5279 output_operand_lossage ("invalid operand for '%%%c'", code);
5280 return;
5283 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5284 break;
5286 case 'M':
5287 case 'm':
5289 int cond_code;
5290 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5291 if (x == const_true_rtx)
5293 if (code == 'M')
5294 fputs ("nv", f);
5295 return;
5298 if (!COMPARISON_P (x))
5300 output_operand_lossage ("invalid operand for '%%%c'", code);
5301 return;
5304 cond_code = aarch64_get_condition_code (x);
5305 gcc_assert (cond_code >= 0);
5306 if (code == 'M')
5307 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5308 fputs (aarch64_condition_codes[cond_code], f);
5310 break;
5312 case 'b':
5313 case 'h':
5314 case 's':
5315 case 'd':
5316 case 'q':
5317 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5319 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5320 return;
5322 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5323 break;
5325 case 'S':
5326 case 'T':
5327 case 'U':
5328 case 'V':
5329 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5331 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5332 return;
5334 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5335 break;
5337 case 'R':
5338 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5340 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5341 return;
5343 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5344 break;
5346 case 'X':
5347 if (!CONST_INT_P (x))
5349 output_operand_lossage ("invalid operand for '%%%c'", code);
5350 return;
5352 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5353 break;
5355 case 'w':
5356 case 'x':
5357 if (x == const0_rtx
5358 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5360 asm_fprintf (f, "%czr", code);
5361 break;
5364 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5366 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5367 break;
5370 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5372 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5373 break;
5376 /* Fall through */
5378 case 0:
5379 if (x == NULL)
5381 output_operand_lossage ("missing operand");
5382 return;
5385 switch (GET_CODE (x))
5387 case REG:
5388 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5389 break;
5391 case MEM:
5392 output_address (GET_MODE (x), XEXP (x, 0));
5393 /* Check all memory references are Pmode - even with ILP32. */
5394 gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5395 break;
5397 case CONST:
5398 case LABEL_REF:
5399 case SYMBOL_REF:
5400 output_addr_const (asm_out_file, x);
5401 break;
5403 case CONST_INT:
5404 asm_fprintf (f, "%wd", INTVAL (x));
5405 break;
5407 case CONST_VECTOR:
5408 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5410 gcc_assert (
5411 aarch64_const_vec_all_same_in_range_p (x,
5412 HOST_WIDE_INT_MIN,
5413 HOST_WIDE_INT_MAX));
5414 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5416 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5418 fputc ('0', f);
5420 else
5421 gcc_unreachable ();
5422 break;
5424 case CONST_DOUBLE:
5425 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5426 be getting CONST_DOUBLEs holding integers. */
5427 gcc_assert (GET_MODE (x) != VOIDmode);
5428 if (aarch64_float_const_zero_rtx_p (x))
5430 fputc ('0', f);
5431 break;
5433 else if (aarch64_float_const_representable_p (x))
5435 #define buf_size 20
5436 char float_buf[buf_size] = {'\0'};
5437 real_to_decimal_for_mode (float_buf,
5438 CONST_DOUBLE_REAL_VALUE (x),
5439 buf_size, buf_size,
5440 1, GET_MODE (x));
5441 asm_fprintf (asm_out_file, "%s", float_buf);
5442 break;
5443 #undef buf_size
5445 output_operand_lossage ("invalid constant");
5446 return;
5447 default:
5448 output_operand_lossage ("invalid operand");
5449 return;
5451 break;
5453 case 'A':
5454 if (GET_CODE (x) == HIGH)
5455 x = XEXP (x, 0);
5457 switch (aarch64_classify_symbolic_expression (x))
5459 case SYMBOL_SMALL_GOT_4G:
5460 asm_fprintf (asm_out_file, ":got:");
5461 break;
5463 case SYMBOL_SMALL_TLSGD:
5464 asm_fprintf (asm_out_file, ":tlsgd:");
5465 break;
5467 case SYMBOL_SMALL_TLSDESC:
5468 asm_fprintf (asm_out_file, ":tlsdesc:");
5469 break;
5471 case SYMBOL_SMALL_TLSIE:
5472 asm_fprintf (asm_out_file, ":gottprel:");
5473 break;
5475 case SYMBOL_TLSLE24:
5476 asm_fprintf (asm_out_file, ":tprel:");
5477 break;
5479 case SYMBOL_TINY_GOT:
5480 gcc_unreachable ();
5481 break;
5483 default:
5484 break;
5486 output_addr_const (asm_out_file, x);
5487 break;
5489 case 'L':
5490 switch (aarch64_classify_symbolic_expression (x))
5492 case SYMBOL_SMALL_GOT_4G:
5493 asm_fprintf (asm_out_file, ":lo12:");
5494 break;
5496 case SYMBOL_SMALL_TLSGD:
5497 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5498 break;
5500 case SYMBOL_SMALL_TLSDESC:
5501 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5502 break;
5504 case SYMBOL_SMALL_TLSIE:
5505 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5506 break;
5508 case SYMBOL_TLSLE12:
5509 asm_fprintf (asm_out_file, ":tprel_lo12:");
5510 break;
5512 case SYMBOL_TLSLE24:
5513 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5514 break;
5516 case SYMBOL_TINY_GOT:
5517 asm_fprintf (asm_out_file, ":got:");
5518 break;
5520 case SYMBOL_TINY_TLSIE:
5521 asm_fprintf (asm_out_file, ":gottprel:");
5522 break;
5524 default:
5525 break;
5527 output_addr_const (asm_out_file, x);
5528 break;
5530 case 'G':
5531 switch (aarch64_classify_symbolic_expression (x))
5533 case SYMBOL_TLSLE24:
5534 asm_fprintf (asm_out_file, ":tprel_hi12:");
5535 break;
5536 default:
5537 break;
5539 output_addr_const (asm_out_file, x);
5540 break;
5542 case 'k':
5544 HOST_WIDE_INT cond_code;
5546 if (!CONST_INT_P (x))
5548 output_operand_lossage ("invalid operand for '%%%c'", code);
5549 return;
5552 cond_code = INTVAL (x);
5553 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5554 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5556 break;
5558 default:
5559 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5560 return;
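/* A hypothetical example of the modifiers above as used in output
   templates: for an operand (reg:SI 3), "%w0" prints "w3" and "%x0"
   prints "x3"; const0_rtx prints as "wzr" or "xzr"; the stack pointer
   prints as "wsp" or "sp"; and "%e" applied to the constant 8 prints the
   extension width suffix "b".  */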
5564 static void
5565 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5567 struct aarch64_address_info addr;
5569 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5570 switch (addr.type)
5572 case ADDRESS_REG_IMM:
5573 if (addr.offset == const0_rtx)
5574 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5575 else
5576 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5577 INTVAL (addr.offset));
5578 return;
5580 case ADDRESS_REG_REG:
5581 if (addr.shift == 0)
5582 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5583 reg_names [REGNO (addr.offset)]);
5584 else
5585 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5586 reg_names [REGNO (addr.offset)], addr.shift);
5587 return;
5589 case ADDRESS_REG_UXTW:
5590 if (addr.shift == 0)
5591 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5592 REGNO (addr.offset) - R0_REGNUM);
5593 else
5594 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5595 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5596 return;
5598 case ADDRESS_REG_SXTW:
5599 if (addr.shift == 0)
5600 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5601 REGNO (addr.offset) - R0_REGNUM);
5602 else
5603 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5604 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5605 return;
5607 case ADDRESS_REG_WB:
5608 switch (GET_CODE (x))
5610 case PRE_INC:
5611 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5612 GET_MODE_SIZE (mode));
5613 return;
5614 case POST_INC:
5615 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5616 GET_MODE_SIZE (mode));
5617 return;
5618 case PRE_DEC:
5619 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5620 GET_MODE_SIZE (mode));
5621 return;
5622 case POST_DEC:
5623 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5624 GET_MODE_SIZE (mode));
5625 return;
5626 case PRE_MODIFY:
5627 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5628 INTVAL (addr.offset));
5629 return;
5630 case POST_MODIFY:
5631 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5632 INTVAL (addr.offset));
5633 return;
5634 default:
5635 break;
5637 break;
5639 case ADDRESS_LO_SUM:
5640 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5641 output_addr_const (f, addr.offset);
5642 asm_fprintf (f, "]");
5643 return;
5645 case ADDRESS_SYMBOLIC:
5646 break;
5649 output_addr_const (f, x);
5652 bool
5653 aarch64_label_mentioned_p (rtx x)
5655 const char *fmt;
5656 int i;
5658 if (GET_CODE (x) == LABEL_REF)
5659 return true;
5661 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5662 referencing instruction, but they are constant offsets, not
5663 symbols. */
5664 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5665 return false;
5667 fmt = GET_RTX_FORMAT (GET_CODE (x));
5668 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5670 if (fmt[i] == 'E')
5672 int j;
5674 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5675 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5676 return 1;
5678 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5679 return 1;
5682 return 0;
5685 /* Implement REGNO_REG_CLASS. */
5687 enum reg_class
5688 aarch64_regno_regclass (unsigned regno)
5690 if (GP_REGNUM_P (regno))
5691 return GENERAL_REGS;
5693 if (regno == SP_REGNUM)
5694 return STACK_REG;
5696 if (regno == FRAME_POINTER_REGNUM
5697 || regno == ARG_POINTER_REGNUM)
5698 return POINTER_REGS;
5700 if (FP_REGNUM_P (regno))
5701 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5703 return NO_REGS;
5706 static rtx
5707 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5709 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5710 where mask is selected by alignment and size of the offset.
5711 We try to pick as large a range for the offset as possible to
5712 maximize the chance of a CSE. However, for aligned addresses
5713 we limit the range to 4k so that structures with different sized
5714 elements are likely to use the same base. We need to be careful
5715 not to split a CONST for some forms of address expression, otherwise
5716 it will generate sub-optimal code. */
5718 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5720 rtx base = XEXP (x, 0);
5721 rtx offset_rtx = XEXP (x, 1);
5722 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5724 if (GET_CODE (base) == PLUS)
5726 rtx op0 = XEXP (base, 0);
5727 rtx op1 = XEXP (base, 1);
5729 /* Force any scaling into a temp for CSE. */
5730 op0 = force_reg (Pmode, op0);
5731 op1 = force_reg (Pmode, op1);
5733 /* Let the pointer register be in op0. */
5734 if (REG_POINTER (op1))
5735 std::swap (op0, op1);
5737 /* If the pointer is virtual or frame related, then we know that
5738 virtual register instantiation or register elimination is going
5739 to apply a second constant. We want the two constants folded
5740 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5741 if (virt_or_elim_regno_p (REGNO (op0)))
5743 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5744 NULL_RTX, true, OPTAB_DIRECT);
5745 return gen_rtx_PLUS (Pmode, base, op1);
5748 /* Otherwise, in order to encourage CSE (and thence loop strength
5749 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5750 base = expand_binop (Pmode, add_optab, op0, op1,
5751 NULL_RTX, true, OPTAB_DIRECT);
5752 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5755 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5756 HOST_WIDE_INT base_offset;
5757 if (GET_MODE_SIZE (mode) > 16)
5758 base_offset = (offset + 0x400) & ~0x7f0;
5759 /* For offsets that aren't a multiple of the access size, the limit is
5760 -256...255. */
5761 else if (offset & (GET_MODE_SIZE (mode) - 1))
5763 base_offset = (offset + 0x100) & ~0x1ff;
5765 /* BLKmode typically uses LDP of X-registers. */
5766 if (mode == BLKmode)
5767 base_offset = (offset + 512) & ~0x3ff;
5769 /* Small negative offsets are supported. */
5770 else if (IN_RANGE (offset, -256, 0))
5771 base_offset = 0;
5772 else if (mode == TImode || mode == TFmode)
5773 base_offset = (offset + 0x100) & ~0x1ff;
5774 /* Use 12-bit offset by access size. */
5775 else
5776 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5778 if (base_offset != 0)
5780 base = plus_constant (Pmode, base, base_offset);
5781 base = force_operand (base, NULL_RTX);
5782 return plus_constant (Pmode, base, offset - base_offset);
5786 return x;
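/* A worked example (numbers derived from the code above): legitimizing
   BASE + 0x10008 for a DImode access picks base_offset = 0x10000, so the
   address is rewritten as (BASE + 0x10000) + 0x8; the anchor can then be
   CSEd across nearby accesses while the residual 0x8 fits the scaled
   12-bit LDR/STR form.  */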
5789 /* Return the reload icode required for a constant pool in mode. */
5790 static enum insn_code
5791 aarch64_constant_pool_reload_icode (machine_mode mode)
5793 switch (mode)
5795 case E_SFmode:
5796 return CODE_FOR_aarch64_reload_movcpsfdi;
5798 case E_DFmode:
5799 return CODE_FOR_aarch64_reload_movcpdfdi;
5801 case E_TFmode:
5802 return CODE_FOR_aarch64_reload_movcptfdi;
5804 case E_V8QImode:
5805 return CODE_FOR_aarch64_reload_movcpv8qidi;
5807 case E_V16QImode:
5808 return CODE_FOR_aarch64_reload_movcpv16qidi;
5810 case E_V4HImode:
5811 return CODE_FOR_aarch64_reload_movcpv4hidi;
5813 case E_V8HImode:
5814 return CODE_FOR_aarch64_reload_movcpv8hidi;
5816 case E_V2SImode:
5817 return CODE_FOR_aarch64_reload_movcpv2sidi;
5819 case E_V4SImode:
5820 return CODE_FOR_aarch64_reload_movcpv4sidi;
5822 case E_V2DImode:
5823 return CODE_FOR_aarch64_reload_movcpv2didi;
5825 case E_V2DFmode:
5826 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5828 default:
5829 gcc_unreachable ();
5832 gcc_unreachable ();
5834 static reg_class_t
5835 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5836 reg_class_t rclass,
5837 machine_mode mode,
5838 secondary_reload_info *sri)
5841 /* If we have to disable direct literal pool loads and stores because the
5842 function is too big, then we need a scratch register. */
5843 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5844 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5845 || targetm.vector_mode_supported_p (GET_MODE (x)))
5846 && !aarch64_pcrelative_literal_loads)
5848 sri->icode = aarch64_constant_pool_reload_icode (mode);
5849 return NO_REGS;
5852 /* Without the TARGET_SIMD instructions we cannot move a Q register
5853 to a Q register directly. We need a scratch. */
5854 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5855 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5856 && reg_class_subset_p (rclass, FP_REGS))
5858 if (mode == TFmode)
5859 sri->icode = CODE_FOR_aarch64_reload_movtf;
5860 else if (mode == TImode)
5861 sri->icode = CODE_FOR_aarch64_reload_movti;
5862 return NO_REGS;
5865 /* A TFmode or TImode memory access should be handled via an FP register
5866 because AArch64 has richer addressing modes for LDR/STR instructions
5867 than for LDP/STP instructions. */
5868 if (TARGET_FLOAT && rclass == GENERAL_REGS
5869 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5870 return FP_REGS;
5872 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5873 return GENERAL_REGS;
5875 return NO_REGS;
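/* Illustrative note on the !TARGET_SIMD case above: a TImode or TFmode copy
   between two Q registers cannot be done directly without AdvSIMD, so the
   aarch64_reload_movti/movtf patterns selected above are used with a
   general-register scratch (see also the matching comment in
   aarch64_register_move_cost below).  */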
5878 static bool
5879 aarch64_can_eliminate (const int from, const int to)
5881 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5882 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5884 if (frame_pointer_needed)
5886 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5887 return true;
5888 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5889 return false;
5890 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5891 && !cfun->calls_alloca)
5892 return true;
5893 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5894 return true;
5896 return false;
5898 else
5900 /* If we decided that we didn't need a leaf frame pointer but then used
5901 LR in the function, then we'll want a frame pointer after all, so
5902 prevent this elimination to ensure a frame pointer is used. */
5903 if (to == STACK_POINTER_REGNUM
5904 && flag_omit_leaf_frame_pointer
5905 && df_regs_ever_live_p (LR_REGNUM))
5906 return false;
5909 return true;
5912 HOST_WIDE_INT
5913 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5915 aarch64_layout_frame ();
5917 if (to == HARD_FRAME_POINTER_REGNUM)
5919 if (from == ARG_POINTER_REGNUM)
5920 return cfun->machine->frame.hard_fp_offset;
5922 if (from == FRAME_POINTER_REGNUM)
5923 return cfun->machine->frame.hard_fp_offset
5924 - cfun->machine->frame.locals_offset;
5927 if (to == STACK_POINTER_REGNUM)
5929 if (from == FRAME_POINTER_REGNUM)
5930 return cfun->machine->frame.frame_size
5931 - cfun->machine->frame.locals_offset;
5934 return cfun->machine->frame.frame_size;
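/* Summary of the relations encoded by the returns above, purely for
   illustration (FINAL_SP is the stack pointer after the frame is allocated):

     ARG_POINTER   = FINAL_SP + frame_size
     FRAME_POINTER = FINAL_SP + frame_size - locals_offset
     HARD_FP       = FINAL_SP + frame_size - hard_fp_offset

   so, e.g., eliminating ARG_POINTER into HARD_FRAME_POINTER uses an offset
   of hard_fp_offset.  */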
5937 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5938 previous frame. */
5941 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5943 if (count != 0)
5944 return const0_rtx;
5945 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5949 static void
5950 aarch64_asm_trampoline_template (FILE *f)
5952 if (TARGET_ILP32)
5954 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5955 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5957 else
5959 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5960 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5962 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5963 assemble_aligned_integer (4, const0_rtx);
5964 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5965 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
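/* For LP64 the template above amounts to, schematically:

	ldr	<ip1>, .+16		// load the target address
	ldr	<chain>, .+20		// load the static chain value
	br	<ip1>
	.word	0			// pad the code out to 16 bytes
	.xword	0			// patched with the function address
	.xword	0			// patched with the static chain

   where the last two words are filled in by aarch64_trampoline_init below;
   the register names come from reg_names[] and are elided here.  */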
5968 static void
5969 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5971 rtx fnaddr, mem, a_tramp;
5972 const int tramp_code_sz = 16;
5974 /* We don't need to copy the trailing D-words; we fill those in below. */
5975 emit_block_move (m_tramp, assemble_trampoline_template (),
5976 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5977 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5978 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5979 if (GET_MODE (fnaddr) != ptr_mode)
5980 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5981 emit_move_insn (mem, fnaddr);
5983 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5984 emit_move_insn (mem, chain_value);
5986 /* XXX We should really define a "clear_cache" pattern and use
5987 gen_clear_cache(). */
5988 a_tramp = XEXP (m_tramp, 0);
5989 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5990 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
5991 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5992 ptr_mode);
5995 static unsigned char
5996 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5998 switch (regclass)
6000 case CALLER_SAVE_REGS:
6001 case POINTER_REGS:
6002 case GENERAL_REGS:
6003 case ALL_REGS:
6004 case FP_REGS:
6005 case FP_LO_REGS:
6006 return
6007 aarch64_vector_mode_p (mode)
6008 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6009 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6010 case STACK_REG:
6011 return 1;
6013 case NO_REGS:
6014 return 0;
6016 default:
6017 break;
6019 gcc_unreachable ();
6022 static reg_class_t
6023 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6025 if (regclass == POINTER_REGS)
6026 return GENERAL_REGS;
6028 if (regclass == STACK_REG)
6030 if (REG_P(x)
6031 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6032 return regclass;
6034 return NO_REGS;
6037 /* Register elimination can result in a request for
6038 SP+constant->FP_REGS. We cannot support such operations, which
6039 use SP as the source and an FP_REG as the destination, so reject
6040 them outright here. */
6041 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6043 rtx lhs = XEXP (x, 0);
6045 /* Look through a possible SUBREG introduced by ILP32. */
6046 if (GET_CODE (lhs) == SUBREG)
6047 lhs = SUBREG_REG (lhs);
6049 gcc_assert (REG_P (lhs));
6050 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6051 POINTER_REGS));
6052 return NO_REGS;
6055 return regclass;
6058 void
6059 aarch64_asm_output_labelref (FILE* f, const char *name)
6061 asm_fprintf (f, "%U%s", name);
6064 static void
6065 aarch64_elf_asm_constructor (rtx symbol, int priority)
6067 if (priority == DEFAULT_INIT_PRIORITY)
6068 default_ctor_section_asm_out_constructor (symbol, priority);
6069 else
6071 section *s;
6072 /* Although priority is known to be in the range [0, 65535], so 18 bytes
6073 would be enough, the compiler might not know that. To avoid a
6074 -Wformat-truncation false positive, use a larger size. */
6075 char buf[23];
6076 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6077 s = get_section (buf, SECTION_WRITE, NULL);
6078 switch_to_section (s);
6079 assemble_align (POINTER_SIZE);
6080 assemble_aligned_integer (POINTER_BYTES, symbol);
6084 static void
6085 aarch64_elf_asm_destructor (rtx symbol, int priority)
6087 if (priority == DEFAULT_INIT_PRIORITY)
6088 default_dtor_section_asm_out_destructor (symbol, priority);
6089 else
6091 section *s;
6092 /* Although priority is known to be in the range [0, 65535], so 18 bytes
6093 would be enough, the compiler might not know that. To avoid a
6094 -Wformat-truncation false positive, use a larger size. */
6095 char buf[23];
6096 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6097 s = get_section (buf, SECTION_WRITE, NULL);
6098 switch_to_section (s);
6099 assemble_align (POINTER_SIZE);
6100 assemble_aligned_integer (POINTER_BYTES, symbol);
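/* For example, a constructor or destructor registered with priority 101
   ends up in a section named ".init_array.00101" or ".fini_array.00101"
   respectively, per the "%.5u" formats above.  */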
6104 const char*
6105 aarch64_output_casesi (rtx *operands)
6107 char buf[100];
6108 char label[100];
6109 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6110 int index;
6111 static const char *const patterns[4][2] =
6114 "ldrb\t%w3, [%0,%w1,uxtw]",
6115 "add\t%3, %4, %w3, sxtb #2"
6118 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6119 "add\t%3, %4, %w3, sxth #2"
6122 "ldr\t%w3, [%0,%w1,uxtw #2]",
6123 "add\t%3, %4, %w3, sxtw #2"
6125 /* We assume that DImode is only generated when not optimizing and
6126 that we don't really need 64-bit address offsets. That would
6127 imply an object file with 8GB of code in a single function! */
6129 "ldr\t%w3, [%0,%w1,uxtw #2]",
6130 "add\t%3, %4, %w3, sxtw #2"
6134 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6136 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6138 gcc_assert (index >= 0 && index <= 3);
6140 /* Need to implement table size reduction, by changing the code below. */
6141 output_asm_insn (patterns[index][0], operands);
6142 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6143 snprintf (buf, sizeof (buf),
6144 "adr\t%%4, %s", targetm.strip_name_encoding (label));
6145 output_asm_insn (buf, operands);
6146 output_asm_insn (patterns[index][1], operands);
6147 output_asm_insn ("br\t%3", operands);
6148 assemble_label (asm_out_file, label);
6149 return "";
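/* For a byte-wide dispatch table (index 0 above) the emitted sequence is
   roughly:

	ldrb	w3, [x0, w1, uxtw]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the label emitted below
	add	x3, x4, w3, sxtb #2	// scale the entry to a byte offset
	br	x3
   .Lrtx<N>:
*/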
6153 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6154 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6155 operator. */
6158 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6160 if (shift >= 0 && shift <= 3)
6162 int size;
6163 for (size = 8; size <= 32; size *= 2)
6165 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6166 if (mask == bits << shift)
6167 return size;
6170 return 0;
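/* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since
   0x3fc == 0xff << 2, i.e. the operand behaves like a UXTB shifted left
   by 2.  */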
6173 /* Constant pools are per-function only when PC-relative
6174 literal loads are enabled or we are using the large memory
6175 model. */
6177 static inline bool
6178 aarch64_can_use_per_function_literal_pools_p (void)
6180 return (aarch64_pcrelative_literal_loads
6181 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6184 static bool
6185 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6187 /* FIXME: In an ideal world this would work similarly
6188 to the logic in aarch64_select_rtx_section, but that
6189 breaks bootstrap in gccgo. For now we work around
6190 this by returning false here. */
6191 return false;
6194 /* Select appropriate section for constants depending
6195 on where we place literal pools. */
6197 static section *
6198 aarch64_select_rtx_section (machine_mode mode,
6199 rtx x,
6200 unsigned HOST_WIDE_INT align)
6202 if (aarch64_can_use_per_function_literal_pools_p ())
6203 return function_section (current_function_decl);
6205 return default_elf_select_rtx_section (mode, x, align);
6208 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6209 void
6210 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6211 HOST_WIDE_INT offset)
6213 /* When using per-function literal pools, we must ensure that any code
6214 section is aligned to the minimal instruction length, lest we get
6215 errors from the assembler about "unaligned instructions". */
6216 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6217 ASM_OUTPUT_ALIGN (f, 2);
6220 /* Costs. */
6222 /* Helper function for rtx cost calculation. Strip a shift expression
6223 from X. Returns the inner operand if successful, or the original
6224 expression on failure. */
6225 static rtx
6226 aarch64_strip_shift (rtx x)
6228 rtx op = x;
6230 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6231 we can convert both to ROR during final output. */
6232 if ((GET_CODE (op) == ASHIFT
6233 || GET_CODE (op) == ASHIFTRT
6234 || GET_CODE (op) == LSHIFTRT
6235 || GET_CODE (op) == ROTATERT
6236 || GET_CODE (op) == ROTATE)
6237 && CONST_INT_P (XEXP (op, 1)))
6238 return XEXP (op, 0);
6240 if (GET_CODE (op) == MULT
6241 && CONST_INT_P (XEXP (op, 1))
6242 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6243 return XEXP (op, 0);
6245 return x;
6248 /* Helper function for rtx cost calculation. Strip an extend
6249 expression from X. Returns the inner operand if successful, or the
6250 original expression on failure. We deal with a number of possible
6251 canonicalization variations here. If STRIP_SHIFT is true, then
6252 we can strip off a shift also. */
6253 static rtx
6254 aarch64_strip_extend (rtx x, bool strip_shift)
6256 rtx op = x;
6258 /* Zero and sign extraction of a widened value. */
6259 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6260 && XEXP (op, 2) == const0_rtx
6261 && GET_CODE (XEXP (op, 0)) == MULT
6262 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6263 XEXP (op, 1)))
6264 return XEXP (XEXP (op, 0), 0);
6266 /* It can also be represented (for zero-extend) as an AND with an
6267 immediate. */
6268 if (GET_CODE (op) == AND
6269 && GET_CODE (XEXP (op, 0)) == MULT
6270 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6271 && CONST_INT_P (XEXP (op, 1))
6272 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6273 INTVAL (XEXP (op, 1))) != 0)
6274 return XEXP (XEXP (op, 0), 0);
6276 /* Now handle extended register, as this may also have an optional
6277 left shift by 1..4. */
6278 if (strip_shift
6279 && GET_CODE (op) == ASHIFT
6280 && CONST_INT_P (XEXP (op, 1))
6281 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6282 op = XEXP (op, 0);
6284 if (GET_CODE (op) == ZERO_EXTEND
6285 || GET_CODE (op) == SIGN_EXTEND)
6286 op = XEXP (op, 0);
6288 if (op != x)
6289 return op;
6291 return x;
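/* For instance, given (ashift (sign_extend (reg x)) (const_int 2)),
   aarch64_strip_extend with STRIP_SHIFT true returns the inner (reg x),
   since both the shift-by-1..4 and the extend can be folded into an
   extended-register operand.  Likewise aarch64_strip_shift above turns
   (mult (reg x) (const_int 8)) back into (reg x).  */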
6294 /* Return true iff CODE is a shift supported in combination
6295 with arithmetic instructions. */
6297 static bool
6298 aarch64_shift_p (enum rtx_code code)
6300 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6304 /* Return true iff X is a cheap shift without a sign extend. */
6306 static bool
6307 aarch64_cheap_mult_shift_p (rtx x)
6309 rtx op0, op1;
6311 op0 = XEXP (x, 0);
6312 op1 = XEXP (x, 1);
6314 if (!(aarch64_tune_params.extra_tuning_flags
6315 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6316 return false;
6318 if (GET_CODE (op0) == SIGN_EXTEND)
6319 return false;
6321 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6322 && UINTVAL (op1) <= 4)
6323 return true;
6325 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6326 return false;
6328 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6330 if (l2 > 0 && l2 <= 4)
6331 return true;
6333 return false;
6336 /* Helper function for rtx cost calculation. Calculate the cost of
6337 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6338 Return the calculated cost of the expression, recursing manually in to
6339 operands where needed. */
6341 static int
6342 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6344 rtx op0, op1;
6345 const struct cpu_cost_table *extra_cost
6346 = aarch64_tune_params.insn_extra_cost;
6347 int cost = 0;
6348 bool compound_p = (outer == PLUS || outer == MINUS);
6349 machine_mode mode = GET_MODE (x);
6351 gcc_checking_assert (code == MULT);
6353 op0 = XEXP (x, 0);
6354 op1 = XEXP (x, 1);
6356 if (VECTOR_MODE_P (mode))
6357 mode = GET_MODE_INNER (mode);
6359 /* Integer multiply/fma. */
6360 if (GET_MODE_CLASS (mode) == MODE_INT)
6362 /* The multiply will be canonicalized as a shift, so cost it as such. */
6363 if (aarch64_shift_p (GET_CODE (x))
6364 || (CONST_INT_P (op1)
6365 && exact_log2 (INTVAL (op1)) > 0))
6367 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6368 || GET_CODE (op0) == SIGN_EXTEND;
6369 if (speed)
6371 if (compound_p)
6373 /* If the shift is considered cheap,
6374 then don't add any cost. */
6375 if (aarch64_cheap_mult_shift_p (x))
6377 else if (REG_P (op1))
6378 /* ARITH + shift-by-register. */
6379 cost += extra_cost->alu.arith_shift_reg;
6380 else if (is_extend)
6381 /* ARITH + extended register. We don't have a cost field
6382 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6383 cost += extra_cost->alu.extend_arith;
6384 else
6385 /* ARITH + shift-by-immediate. */
6386 cost += extra_cost->alu.arith_shift;
6388 else
6389 /* LSL (immediate). */
6390 cost += extra_cost->alu.shift;
6393 /* Strip extends as we will have costed them in the case above. */
6394 if (is_extend)
6395 op0 = aarch64_strip_extend (op0, true);
6397 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6399 return cost;
6402 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6403 compound operation, and let the cases below handle it. After all, MNEG
6404 is a special-case alias of MSUB. */
6405 if (GET_CODE (op0) == NEG)
6407 op0 = XEXP (op0, 0);
6408 compound_p = true;
6411 /* Integer multiplies or FMAs have zero/sign extending variants. */
6412 if ((GET_CODE (op0) == ZERO_EXTEND
6413 && GET_CODE (op1) == ZERO_EXTEND)
6414 || (GET_CODE (op0) == SIGN_EXTEND
6415 && GET_CODE (op1) == SIGN_EXTEND))
6417 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6418 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6420 if (speed)
6422 if (compound_p)
6423 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6424 cost += extra_cost->mult[0].extend_add;
6425 else
6426 /* MUL/SMULL/UMULL. */
6427 cost += extra_cost->mult[0].extend;
6430 return cost;
6433 /* This is either an integer multiply or a MADD. In both cases
6434 we want to recurse and cost the operands. */
6435 cost += rtx_cost (op0, mode, MULT, 0, speed);
6436 cost += rtx_cost (op1, mode, MULT, 1, speed);
6438 if (speed)
6440 if (compound_p)
6441 /* MADD/MSUB. */
6442 cost += extra_cost->mult[mode == DImode].add;
6443 else
6444 /* MUL. */
6445 cost += extra_cost->mult[mode == DImode].simple;
6448 return cost;
6450 else
6452 if (speed)
6454 /* Floating-point FMA/FMUL can also support negations of the
6455 operands, unless the rounding mode is upward or downward, in
6456 which case FNMUL is different from FMUL with operand negation. */
6457 bool neg0 = GET_CODE (op0) == NEG;
6458 bool neg1 = GET_CODE (op1) == NEG;
6459 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6461 if (neg0)
6462 op0 = XEXP (op0, 0);
6463 if (neg1)
6464 op1 = XEXP (op1, 0);
6467 if (compound_p)
6468 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6469 cost += extra_cost->fp[mode == DFmode].fma;
6470 else
6471 /* FMUL/FNMUL. */
6472 cost += extra_cost->fp[mode == DFmode].mult;
6475 cost += rtx_cost (op0, mode, MULT, 0, speed);
6476 cost += rtx_cost (op1, mode, MULT, 1, speed);
6477 return cost;
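/* As a concrete example of the shift handling above (illustrative only),
   costing (plus (mult (reg x2) (const_int 4)) (reg x1)) with SPEED set
   charges alu.arith_shift (or nothing on CHEAP_SHIFT_EXTEND tunings),
   roughly matching a single "add x0, x1, x2, lsl #2".  */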
6481 static int
6482 aarch64_address_cost (rtx x,
6483 machine_mode mode,
6484 addr_space_t as ATTRIBUTE_UNUSED,
6485 bool speed)
6487 enum rtx_code c = GET_CODE (x);
6488 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6489 struct aarch64_address_info info;
6490 int cost = 0;
6491 info.shift = 0;
6493 if (!aarch64_classify_address (&info, x, mode, c, false))
6495 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6497 /* This is a CONST or SYMBOL ref which will be split
6498 in a different way depending on the code model in use.
6499 Cost it through the generic infrastructure. */
6500 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6501 /* Divide through by the cost of one instruction to
6502 bring it to the same units as the address costs. */
6503 cost_symbol_ref /= COSTS_N_INSNS (1);
6504 /* The cost is then the cost of preparing the address,
6505 followed by an immediate (possibly 0) offset. */
6506 return cost_symbol_ref + addr_cost->imm_offset;
6508 else
6510 /* This is most likely a jump table from a case
6511 statement. */
6512 return addr_cost->register_offset;
6516 switch (info.type)
6518 case ADDRESS_LO_SUM:
6519 case ADDRESS_SYMBOLIC:
6520 case ADDRESS_REG_IMM:
6521 cost += addr_cost->imm_offset;
6522 break;
6524 case ADDRESS_REG_WB:
6525 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6526 cost += addr_cost->pre_modify;
6527 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6528 cost += addr_cost->post_modify;
6529 else
6530 gcc_unreachable ();
6532 break;
6534 case ADDRESS_REG_REG:
6535 cost += addr_cost->register_offset;
6536 break;
6538 case ADDRESS_REG_SXTW:
6539 cost += addr_cost->register_sextend;
6540 break;
6542 case ADDRESS_REG_UXTW:
6543 cost += addr_cost->register_zextend;
6544 break;
6546 default:
6547 gcc_unreachable ();
6551 if (info.shift > 0)
6553 /* For the sake of calculating the cost of the shifted register
6554 component, we can treat same sized modes in the same way. */
6555 switch (GET_MODE_BITSIZE (mode))
6557 case 16:
6558 cost += addr_cost->addr_scale_costs.hi;
6559 break;
6561 case 32:
6562 cost += addr_cost->addr_scale_costs.si;
6563 break;
6565 case 64:
6566 cost += addr_cost->addr_scale_costs.di;
6567 break;
6569 /* We can't tell, or this is a 128-bit vector. */
6570 default:
6571 cost += addr_cost->addr_scale_costs.ti;
6572 break;
6576 return cost;
6579 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6580 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6581 to be taken. */
6584 aarch64_branch_cost (bool speed_p, bool predictable_p)
6586 /* When optimizing for speed, use the cost of unpredictable branches. */
6587 const struct cpu_branch_cost *branch_costs =
6588 aarch64_tune_params.branch_costs;
6590 if (!speed_p || predictable_p)
6591 return branch_costs->predictable;
6592 else
6593 return branch_costs->unpredictable;
6596 /* Return true if the RTX X in mode MODE is a zero or sign extract
6597 usable in an ADD or SUB (extended register) instruction. */
6598 static bool
6599 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6601 /* Catch add with a sign extract.
6602 This is add_<optab><mode>_multp2. */
6603 if (GET_CODE (x) == SIGN_EXTRACT
6604 || GET_CODE (x) == ZERO_EXTRACT)
6606 rtx op0 = XEXP (x, 0);
6607 rtx op1 = XEXP (x, 1);
6608 rtx op2 = XEXP (x, 2);
6610 if (GET_CODE (op0) == MULT
6611 && CONST_INT_P (op1)
6612 && op2 == const0_rtx
6613 && CONST_INT_P (XEXP (op0, 1))
6614 && aarch64_is_extend_from_extract (mode,
6615 XEXP (op0, 1),
6616 op1))
6618 return true;
6621 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6622 No shift. */
6623 else if (GET_CODE (x) == SIGN_EXTEND
6624 || GET_CODE (x) == ZERO_EXTEND)
6625 return REG_P (XEXP (x, 0));
6627 return false;
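/* The "simple case" above covers operands such as (sign_extend (reg w1))
   inside a PLUS or MINUS, i.e. roughly "add x0, x0, w1, sxtw"; the callers
   then charge extra_cost->alu.extend_arith for it.  */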
6630 static bool
6631 aarch64_frint_unspec_p (unsigned int u)
6633 switch (u)
6635 case UNSPEC_FRINTZ:
6636 case UNSPEC_FRINTP:
6637 case UNSPEC_FRINTM:
6638 case UNSPEC_FRINTA:
6639 case UNSPEC_FRINTN:
6640 case UNSPEC_FRINTX:
6641 case UNSPEC_FRINTI:
6642 return true;
6644 default:
6645 return false;
6649 /* Return true iff X is an rtx that will match an extr instruction
6650 i.e. as described in the *extr<mode>5_insn family of patterns.
6651 OP0 and OP1 will be set to the operands of the shifts involved
6652 on success and will be NULL_RTX otherwise. */
6654 static bool
6655 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6657 rtx op0, op1;
6658 machine_mode mode = GET_MODE (x);
6660 *res_op0 = NULL_RTX;
6661 *res_op1 = NULL_RTX;
6663 if (GET_CODE (x) != IOR)
6664 return false;
6666 op0 = XEXP (x, 0);
6667 op1 = XEXP (x, 1);
6669 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6670 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6672 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6673 if (GET_CODE (op1) == ASHIFT)
6674 std::swap (op0, op1);
6676 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6677 return false;
6679 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6680 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6682 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6683 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6685 *res_op0 = XEXP (op0, 0);
6686 *res_op1 = XEXP (op1, 0);
6687 return true;
6691 return false;
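/* Illustration: in DImode, (ior (ashift a (const_int 16))
   (lshiftrt b (const_int 48))) satisfies the 16 + 48 == 64 check above and
   so matches, corresponding roughly to "extr xd, xa, xb, #48".  */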
6694 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6695 storing it in *COST. Result is true if the total cost of the operation
6696 has now been calculated. */
6697 static bool
6698 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6700 rtx inner;
6701 rtx comparator;
6702 enum rtx_code cmpcode;
6704 if (COMPARISON_P (op0))
6706 inner = XEXP (op0, 0);
6707 comparator = XEXP (op0, 1);
6708 cmpcode = GET_CODE (op0);
6710 else
6712 inner = op0;
6713 comparator = const0_rtx;
6714 cmpcode = NE;
6717 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6719 /* Conditional branch. */
6720 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6721 return true;
6722 else
6724 if (cmpcode == NE || cmpcode == EQ)
6726 if (comparator == const0_rtx)
6728 /* TBZ/TBNZ/CBZ/CBNZ. */
6729 if (GET_CODE (inner) == ZERO_EXTRACT)
6730 /* TBZ/TBNZ. */
6731 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6732 ZERO_EXTRACT, 0, speed);
6733 else
6734 /* CBZ/CBNZ. */
6735 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6737 return true;
6740 else if (cmpcode == LT || cmpcode == GE)
6742 /* TBZ/TBNZ. */
6743 if (comparator == const0_rtx)
6744 return true;
6748 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6750 /* CCMP. */
6751 if (GET_CODE (op1) == COMPARE)
6753 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6754 if (XEXP (op1, 1) == const0_rtx)
6755 *cost += 1;
6756 if (speed)
6758 machine_mode mode = GET_MODE (XEXP (op1, 0));
6759 const struct cpu_cost_table *extra_cost
6760 = aarch64_tune_params.insn_extra_cost;
6762 if (GET_MODE_CLASS (mode) == MODE_INT)
6763 *cost += extra_cost->alu.arith;
6764 else
6765 *cost += extra_cost->fp[mode == DFmode].compare;
6767 return true;
6770 /* It's a conditional operation based on the status flags,
6771 so it must be some flavor of CSEL. */
6773 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6774 if (GET_CODE (op1) == NEG
6775 || GET_CODE (op1) == NOT
6776 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6777 op1 = XEXP (op1, 0);
6778 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6780 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6781 op1 = XEXP (op1, 0);
6782 op2 = XEXP (op2, 0);
6785 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6786 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6787 return true;
6790 /* We don't know what this is, so cost all operands. */
6791 return false;
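/* By way of example, the branch cases above typically cover code such as
   "if (x & (1 << 5)) ..." (a ZERO_EXTRACT compared against zero, i.e.
   TBZ/TBNZ) and "if (x == 0) ..." (CBZ/CBNZ), both of which only cost the
   operand being tested.  */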
6794 /* Check whether X is a bitfield operation of the form shift + extend that
6795 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6796 operand to which the bitfield operation is applied. Otherwise return
6797 NULL_RTX. */
6799 static rtx
6800 aarch64_extend_bitfield_pattern_p (rtx x)
6802 rtx_code outer_code = GET_CODE (x);
6803 machine_mode outer_mode = GET_MODE (x);
6805 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6806 && outer_mode != SImode && outer_mode != DImode)
6807 return NULL_RTX;
6809 rtx inner = XEXP (x, 0);
6810 rtx_code inner_code = GET_CODE (inner);
6811 machine_mode inner_mode = GET_MODE (inner);
6812 rtx op = NULL_RTX;
6814 switch (inner_code)
6816 case ASHIFT:
6817 if (CONST_INT_P (XEXP (inner, 1))
6818 && (inner_mode == QImode || inner_mode == HImode))
6819 op = XEXP (inner, 0);
6820 break;
6821 case LSHIFTRT:
6822 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6823 && (inner_mode == QImode || inner_mode == HImode))
6824 op = XEXP (inner, 0);
6825 break;
6826 case ASHIFTRT:
6827 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6828 && (inner_mode == QImode || inner_mode == HImode))
6829 op = XEXP (inner, 0);
6830 break;
6831 default:
6832 break;
6835 return op;
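/* For example, (zero_extend:SI (lshiftrt:HI (reg) (const_int 3))) is the
   kind of pattern recognised here: it can be implemented with a single
   UBFX, so only the inner operand is returned for the callers to cost
   (plus alu.bfx).  */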
6838 /* Return true if the mask and a shift amount from an RTX of the form
6839 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6840 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6842 bool
6843 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6845 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6846 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6847 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6848 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
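/* For example, with MODE == SImode, MASK == 0xff0 and SHFT_AMNT == 4 the
   predicate holds: (x << 4) & 0xff0 is equivalent to
   "ubfiz w0, w1, #4, #8".  */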
6851 /* Calculate the cost of calculating X, storing it in *COST. Result
6852 is true if the total cost of the operation has now been calculated. */
6853 static bool
6854 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6855 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6857 rtx op0, op1, op2;
6858 const struct cpu_cost_table *extra_cost
6859 = aarch64_tune_params.insn_extra_cost;
6860 int code = GET_CODE (x);
6861 scalar_int_mode int_mode;
6863 /* By default, assume that everything has equivalent cost to the
6864 cheapest instruction. Any additional costs are applied as a delta
6865 above this default. */
6866 *cost = COSTS_N_INSNS (1);
6868 switch (code)
6870 case SET:
6871 /* The cost depends entirely on the operands to SET. */
6872 *cost = 0;
6873 op0 = SET_DEST (x);
6874 op1 = SET_SRC (x);
6876 switch (GET_CODE (op0))
6878 case MEM:
6879 if (speed)
6881 rtx address = XEXP (op0, 0);
6882 if (VECTOR_MODE_P (mode))
6883 *cost += extra_cost->ldst.storev;
6884 else if (GET_MODE_CLASS (mode) == MODE_INT)
6885 *cost += extra_cost->ldst.store;
6886 else if (mode == SFmode)
6887 *cost += extra_cost->ldst.storef;
6888 else if (mode == DFmode)
6889 *cost += extra_cost->ldst.stored;
6891 *cost +=
6892 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6893 0, speed));
6896 *cost += rtx_cost (op1, mode, SET, 1, speed);
6897 return true;
6899 case SUBREG:
6900 if (! REG_P (SUBREG_REG (op0)))
6901 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6903 /* Fall through. */
6904 case REG:
6905 /* The cost is one per vector-register copied. */
6906 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6908 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6909 / GET_MODE_SIZE (V4SImode);
6910 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6912 /* const0_rtx is in general free, but we will use an
6913 instruction to set a register to 0. */
6914 else if (REG_P (op1) || op1 == const0_rtx)
6916 /* The cost is 1 per register copied. */
6917 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6918 / UNITS_PER_WORD;
6919 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6921 else
6922 /* Cost is just the cost of the RHS of the set. */
6923 *cost += rtx_cost (op1, mode, SET, 1, speed);
6924 return true;
6926 case ZERO_EXTRACT:
6927 case SIGN_EXTRACT:
6928 /* Bit-field insertion. Strip any redundant widening of
6929 the RHS to meet the width of the target. */
6930 if (GET_CODE (op1) == SUBREG)
6931 op1 = SUBREG_REG (op1);
6932 if ((GET_CODE (op1) == ZERO_EXTEND
6933 || GET_CODE (op1) == SIGN_EXTEND)
6934 && CONST_INT_P (XEXP (op0, 1))
6935 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6936 >= INTVAL (XEXP (op0, 1))))
6937 op1 = XEXP (op1, 0);
6939 if (CONST_INT_P (op1))
6941 /* MOV immediate is assumed to always be cheap. */
6942 *cost = COSTS_N_INSNS (1);
6944 else
6946 /* BFM. */
6947 if (speed)
6948 *cost += extra_cost->alu.bfi;
6949 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6952 return true;
6954 default:
6955 /* We can't make sense of this, assume default cost. */
6956 *cost = COSTS_N_INSNS (1);
6957 return false;
6959 return false;
6961 case CONST_INT:
6962 /* If an instruction can incorporate a constant within the
6963 instruction, the instruction's expression avoids calling
6964 rtx_cost() on the constant. If rtx_cost() is called on a
6965 constant, then it is usually because the constant must be
6966 moved into a register by one or more instructions.
6968 The exception is constant 0, which can be expressed
6969 as XZR/WZR and is therefore free. The exception to this is
6970 if we have (set (reg) (const0_rtx)) in which case we must cost
6971 the move. However, we can catch that when we cost the SET, so
6972 we don't need to consider that here. */
6973 if (x == const0_rtx)
6974 *cost = 0;
6975 else
6977 /* To an approximation, building any other constant is
6978 proportionally expensive to the number of instructions
6979 required to build that constant. This is true whether we
6980 are compiling for SPEED or otherwise. */
6981 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6982 (NULL_RTX, x, false, mode));
6984 return true;
6986 case CONST_DOUBLE:
6988 /* First determine number of instructions to do the move
6989 as an integer constant. */
6990 if (!aarch64_float_const_representable_p (x)
6991 && !aarch64_can_const_movi_rtx_p (x, mode)
6992 && aarch64_float_const_rtx_p (x))
6994 unsigned HOST_WIDE_INT ival;
6995 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6996 gcc_assert (succeed);
6998 machine_mode imode = (mode == HFmode
6999 ? SImode
7000 : int_mode_for_mode (mode).require ());
7001 int ncost = aarch64_internal_mov_immediate
7002 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7003 *cost += COSTS_N_INSNS (ncost);
7004 return true;
7007 if (speed)
7009 /* mov[df,sf]_aarch64. */
7010 if (aarch64_float_const_representable_p (x))
7011 /* FMOV (scalar immediate). */
7012 *cost += extra_cost->fp[mode == DFmode].fpconst;
7013 else if (!aarch64_float_const_zero_rtx_p (x))
7015 /* This will be a load from memory. */
7016 if (mode == DFmode)
7017 *cost += extra_cost->ldst.loadd;
7018 else
7019 *cost += extra_cost->ldst.loadf;
7021 else
7022 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7023 or MOV v0.s[0], wzr - neither of which is modeled by the
7024 cost tables. Just use the default cost. */
7029 return true;
7031 case MEM:
7032 if (speed)
7034 /* For loads we want the base cost of a load, plus an
7035 approximation for the additional cost of the addressing
7036 mode. */
7037 rtx address = XEXP (x, 0);
7038 if (VECTOR_MODE_P (mode))
7039 *cost += extra_cost->ldst.loadv;
7040 else if (GET_MODE_CLASS (mode) == MODE_INT)
7041 *cost += extra_cost->ldst.load;
7042 else if (mode == SFmode)
7043 *cost += extra_cost->ldst.loadf;
7044 else if (mode == DFmode)
7045 *cost += extra_cost->ldst.loadd;
7047 *cost +=
7048 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7049 0, speed));
7052 return true;
7054 case NEG:
7055 op0 = XEXP (x, 0);
7057 if (VECTOR_MODE_P (mode))
7059 if (speed)
7061 /* FNEG. */
7062 *cost += extra_cost->vect.alu;
7064 return false;
7067 if (GET_MODE_CLASS (mode) == MODE_INT)
7069 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7070 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7072 /* CSETM. */
7073 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7074 return true;
7077 /* Cost this as SUB wzr, X. */
7078 op0 = CONST0_RTX (mode);
7079 op1 = XEXP (x, 0);
7080 goto cost_minus;
7083 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7085 /* Support (neg(fma...)) as a single instruction only if
7086 sign of zeros is unimportant. This matches the decision
7087 making in aarch64.md. */
7088 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7090 /* FNMADD. */
7091 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7092 return true;
7094 if (GET_CODE (op0) == MULT)
7096 /* FNMUL. */
7097 *cost = rtx_cost (op0, mode, NEG, 0, speed);
7098 return true;
7100 if (speed)
7101 /* FNEG. */
7102 *cost += extra_cost->fp[mode == DFmode].neg;
7103 return false;
7106 return false;
7108 case CLRSB:
7109 case CLZ:
7110 if (speed)
7112 if (VECTOR_MODE_P (mode))
7113 *cost += extra_cost->vect.alu;
7114 else
7115 *cost += extra_cost->alu.clz;
7118 return false;
7120 case COMPARE:
7121 op0 = XEXP (x, 0);
7122 op1 = XEXP (x, 1);
7124 if (op1 == const0_rtx
7125 && GET_CODE (op0) == AND)
7127 x = op0;
7128 mode = GET_MODE (op0);
7129 goto cost_logic;
7132 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7134 /* TODO: A write to the CC flags possibly costs extra; this
7135 needs encoding in the cost tables. */
7137 mode = GET_MODE (op0);
7138 /* ANDS. */
7139 if (GET_CODE (op0) == AND)
7141 x = op0;
7142 goto cost_logic;
7145 if (GET_CODE (op0) == PLUS)
7147 /* ADDS (and CMN alias). */
7148 x = op0;
7149 goto cost_plus;
7152 if (GET_CODE (op0) == MINUS)
7154 /* SUBS. */
7155 x = op0;
7156 goto cost_minus;
7159 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7160 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7161 && CONST_INT_P (XEXP (op0, 2)))
7163 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7164 Handle it here directly rather than going to cost_logic
7165 since we know the immediate generated for the TST is valid
7166 so we can avoid creating an intermediate rtx for it only
7167 for costing purposes. */
7168 if (speed)
7169 *cost += extra_cost->alu.logical;
7171 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7172 ZERO_EXTRACT, 0, speed);
7173 return true;
7176 if (GET_CODE (op1) == NEG)
7178 /* CMN. */
7179 if (speed)
7180 *cost += extra_cost->alu.arith;
7182 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7183 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7184 return true;
7187 /* CMP.
7189 Compare can freely swap the order of operands, and
7190 canonicalization puts the more complex operation first.
7191 But the integer MINUS logic expects the shift/extend
7192 operation in op1. */
7193 if (! (REG_P (op0)
7194 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7196 op0 = XEXP (x, 1);
7197 op1 = XEXP (x, 0);
7199 goto cost_minus;
7202 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7204 /* FCMP. */
7205 if (speed)
7206 *cost += extra_cost->fp[mode == DFmode].compare;
7208 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7210 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7211 /* FCMP supports constant 0.0 for no extra cost. */
7212 return true;
7214 return false;
7217 if (VECTOR_MODE_P (mode))
7219 /* Vector compare. */
7220 if (speed)
7221 *cost += extra_cost->vect.alu;
7223 if (aarch64_float_const_zero_rtx_p (op1))
7225 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7226 cost. */
7227 return true;
7229 return false;
7231 return false;
7233 case MINUS:
7235 op0 = XEXP (x, 0);
7236 op1 = XEXP (x, 1);
7238 cost_minus:
7239 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7241 /* Detect valid immediates. */
7242 if ((GET_MODE_CLASS (mode) == MODE_INT
7243 || (GET_MODE_CLASS (mode) == MODE_CC
7244 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7245 && CONST_INT_P (op1)
7246 && aarch64_uimm12_shift (INTVAL (op1)))
7248 if (speed)
7249 /* SUB(S) (immediate). */
7250 *cost += extra_cost->alu.arith;
7251 return true;
7254 /* Look for SUB (extended register). */
7255 if (aarch64_rtx_arith_op_extract_p (op1, mode))
7257 if (speed)
7258 *cost += extra_cost->alu.extend_arith;
7260 op1 = aarch64_strip_extend (op1, true);
7261 *cost += rtx_cost (op1, VOIDmode,
7262 (enum rtx_code) GET_CODE (op1), 0, speed);
7263 return true;
7266 rtx new_op1 = aarch64_strip_extend (op1, false);
7268 /* Cost this as an FMA-alike operation. */
7269 if ((GET_CODE (new_op1) == MULT
7270 || aarch64_shift_p (GET_CODE (new_op1)))
7271 && code != COMPARE)
7273 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7274 (enum rtx_code) code,
7275 speed);
7276 return true;
7279 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7281 if (speed)
7283 if (VECTOR_MODE_P (mode))
7285 /* Vector SUB. */
7286 *cost += extra_cost->vect.alu;
7288 else if (GET_MODE_CLASS (mode) == MODE_INT)
7290 /* SUB(S). */
7291 *cost += extra_cost->alu.arith;
7293 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7295 /* FSUB. */
7296 *cost += extra_cost->fp[mode == DFmode].addsub;
7299 return true;
7302 case PLUS:
7304 rtx new_op0;
7306 op0 = XEXP (x, 0);
7307 op1 = XEXP (x, 1);
7309 cost_plus:
7310 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7311 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7313 /* CSINC. */
7314 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7315 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7316 return true;
7319 if (GET_MODE_CLASS (mode) == MODE_INT
7320 && CONST_INT_P (op1)
7321 && aarch64_uimm12_shift (INTVAL (op1)))
7323 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7325 if (speed)
7326 /* ADD (immediate). */
7327 *cost += extra_cost->alu.arith;
7328 return true;
7331 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7333 /* Look for ADD (extended register). */
7334 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7336 if (speed)
7337 *cost += extra_cost->alu.extend_arith;
7339 op0 = aarch64_strip_extend (op0, true);
7340 *cost += rtx_cost (op0, VOIDmode,
7341 (enum rtx_code) GET_CODE (op0), 0, speed);
7342 return true;
7345 /* Strip any extend, leave shifts behind as we will
7346 cost them through mult_cost. */
7347 new_op0 = aarch64_strip_extend (op0, false);
7349 if (GET_CODE (new_op0) == MULT
7350 || aarch64_shift_p (GET_CODE (new_op0)))
7352 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7353 speed);
7354 return true;
7357 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7359 if (speed)
7361 if (VECTOR_MODE_P (mode))
7363 /* Vector ADD. */
7364 *cost += extra_cost->vect.alu;
7366 else if (GET_MODE_CLASS (mode) == MODE_INT)
7368 /* ADD. */
7369 *cost += extra_cost->alu.arith;
7371 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7373 /* FADD. */
7374 *cost += extra_cost->fp[mode == DFmode].addsub;
7377 return true;
7380 case BSWAP:
7381 *cost = COSTS_N_INSNS (1);
7383 if (speed)
7385 if (VECTOR_MODE_P (mode))
7386 *cost += extra_cost->vect.alu;
7387 else
7388 *cost += extra_cost->alu.rev;
7390 return false;
7392 case IOR:
7393 if (aarch_rev16_p (x))
7395 *cost = COSTS_N_INSNS (1);
7397 if (speed)
7399 if (VECTOR_MODE_P (mode))
7400 *cost += extra_cost->vect.alu;
7401 else
7402 *cost += extra_cost->alu.rev;
7404 return true;
7407 if (aarch64_extr_rtx_p (x, &op0, &op1))
7409 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7410 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7411 if (speed)
7412 *cost += extra_cost->alu.shift;
7414 return true;
7416 /* Fall through. */
7417 case XOR:
7418 case AND:
7419 cost_logic:
7420 op0 = XEXP (x, 0);
7421 op1 = XEXP (x, 1);
7423 if (VECTOR_MODE_P (mode))
7425 if (speed)
7426 *cost += extra_cost->vect.alu;
7427 return true;
7430 if (code == AND
7431 && GET_CODE (op0) == MULT
7432 && CONST_INT_P (XEXP (op0, 1))
7433 && CONST_INT_P (op1)
7434 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7435 INTVAL (op1)) != 0)
7437 /* This is a UBFM/SBFM. */
7438 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7439 if (speed)
7440 *cost += extra_cost->alu.bfx;
7441 return true;
7444 if (is_int_mode (mode, &int_mode))
7446 if (CONST_INT_P (op1))
7448 /* We have a mask + shift version of a UBFIZ
7449 i.e. the *andim_ashift<mode>_bfiz pattern. */
7450 if (GET_CODE (op0) == ASHIFT
7451 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7452 XEXP (op0, 1)))
7454 *cost += rtx_cost (XEXP (op0, 0), int_mode,
7455 (enum rtx_code) code, 0, speed);
7456 if (speed)
7457 *cost += extra_cost->alu.bfx;
7459 return true;
7461 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7463 /* We possibly get the immediate for free; this is not
7464 modelled. */
7465 *cost += rtx_cost (op0, int_mode,
7466 (enum rtx_code) code, 0, speed);
7467 if (speed)
7468 *cost += extra_cost->alu.logical;
7470 return true;
7473 else
7475 rtx new_op0 = op0;
7477 /* Handle ORN, EON, or BIC. */
7478 if (GET_CODE (op0) == NOT)
7479 op0 = XEXP (op0, 0);
7481 new_op0 = aarch64_strip_shift (op0);
7483 /* If we had a shift on op0 then this is a logical-shift-
7484 by-register/immediate operation. Otherwise, this is just
7485 a logical operation. */
7486 if (speed)
7488 if (new_op0 != op0)
7490 /* Shift by immediate. */
7491 if (CONST_INT_P (XEXP (op0, 1)))
7492 *cost += extra_cost->alu.log_shift;
7493 else
7494 *cost += extra_cost->alu.log_shift_reg;
7496 else
7497 *cost += extra_cost->alu.logical;
7500 /* In both cases we want to cost both operands. */
7501 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7502 0, speed);
7503 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7504 1, speed);
7506 return true;
7509 return false;
7511 case NOT:
7512 x = XEXP (x, 0);
7513 op0 = aarch64_strip_shift (x);
7515 if (VECTOR_MODE_P (mode))
7517 /* Vector NOT. */
7518 *cost += extra_cost->vect.alu;
7519 return false;
7522 /* MVN-shifted-reg. */
7523 if (op0 != x)
7525 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7527 if (speed)
7528 *cost += extra_cost->alu.log_shift;
7530 return true;
7532 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7533 Handle the second form here, taking care that 'a' in the above can
7534 be a shift. */
7535 else if (GET_CODE (op0) == XOR)
7537 rtx newop0 = XEXP (op0, 0);
7538 rtx newop1 = XEXP (op0, 1);
7539 rtx op0_stripped = aarch64_strip_shift (newop0);
7541 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7542 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7544 if (speed)
7546 if (op0_stripped != newop0)
7547 *cost += extra_cost->alu.log_shift;
7548 else
7549 *cost += extra_cost->alu.logical;
7552 return true;
7554 /* MVN. */
7555 if (speed)
7556 *cost += extra_cost->alu.logical;
7558 return false;
7560 case ZERO_EXTEND:
7562 op0 = XEXP (x, 0);
7563 /* If a value is written in SI mode, then zero extended to DI
7564 mode, the operation will in general be free as a write to
7565 a 'w' register implicitly zeroes the upper bits of an 'x'
7566 register. However, if this is
7568 (set (reg) (zero_extend (reg)))
7570 we must cost the explicit register move. */
7571 if (mode == DImode
7572 && GET_MODE (op0) == SImode
7573 && outer == SET)
7575 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7577 /* If OP_COST is non-zero, then the cost of the zero extend
7578 is effectively the cost of the inner operation. Otherwise
7579 we have a MOV instruction and we take the cost from the MOV
7580 itself. This is true independently of whether we are
7581 optimizing for space or time. */
7582 if (op_cost)
7583 *cost = op_cost;
7585 return true;
7587 else if (MEM_P (op0))
7589 /* All loads can zero extend to any size for free. */
7590 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7591 return true;
7594 op0 = aarch64_extend_bitfield_pattern_p (x);
7595 if (op0)
7597 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7598 if (speed)
7599 *cost += extra_cost->alu.bfx;
7600 return true;
7603 if (speed)
7605 if (VECTOR_MODE_P (mode))
7607 /* UMOV. */
7608 *cost += extra_cost->vect.alu;
7610 else
7612 /* We generate an AND instead of UXTB/UXTH. */
7613 *cost += extra_cost->alu.logical;
7616 return false;
7618 case SIGN_EXTEND:
7619 if (MEM_P (XEXP (x, 0)))
7621 /* LDRSH. */
7622 if (speed)
7624 rtx address = XEXP (XEXP (x, 0), 0);
7625 *cost += extra_cost->ldst.load_sign_extend;
7627 *cost +=
7628 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7629 0, speed));
7631 return true;
7634 op0 = aarch64_extend_bitfield_pattern_p (x);
7635 if (op0)
7637 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7638 if (speed)
7639 *cost += extra_cost->alu.bfx;
7640 return true;
7643 if (speed)
7645 if (VECTOR_MODE_P (mode))
7646 *cost += extra_cost->vect.alu;
7647 else
7648 *cost += extra_cost->alu.extend;
7650 return false;
7652 case ASHIFT:
7653 op0 = XEXP (x, 0);
7654 op1 = XEXP (x, 1);
7656 if (CONST_INT_P (op1))
7658 if (speed)
7660 if (VECTOR_MODE_P (mode))
7662 /* Vector shift (immediate). */
7663 *cost += extra_cost->vect.alu;
7665 else
7667 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7668 aliases. */
7669 *cost += extra_cost->alu.shift;
7673 /* We can incorporate zero/sign extend for free. */
7674 if (GET_CODE (op0) == ZERO_EXTEND
7675 || GET_CODE (op0) == SIGN_EXTEND)
7676 op0 = XEXP (op0, 0);
7678 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7679 return true;
7681 else
7683 if (VECTOR_MODE_P (mode))
7685 if (speed)
7686 /* Vector shift (register). */
7687 *cost += extra_cost->vect.alu;
7689 else
7691 if (speed)
7692 /* LSLV. */
7693 *cost += extra_cost->alu.shift_reg;
7695 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7696 && CONST_INT_P (XEXP (op1, 1))
7697 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7699 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7700 /* We already demanded XEXP (op1, 0) to be REG_P, so
7701 don't recurse into it. */
7702 return true;
7705 return false; /* All arguments need to be in registers. */
7708 case ROTATE:
7709 case ROTATERT:
7710 case LSHIFTRT:
7711 case ASHIFTRT:
7712 op0 = XEXP (x, 0);
7713 op1 = XEXP (x, 1);
7715 if (CONST_INT_P (op1))
7717 /* ASR (immediate) and friends. */
7718 if (speed)
7720 if (VECTOR_MODE_P (mode))
7721 *cost += extra_cost->vect.alu;
7722 else
7723 *cost += extra_cost->alu.shift;
7726 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7727 return true;
7729 else
7731 if (VECTOR_MODE_P (mode))
7733 if (speed)
7734 /* Vector shift (register). */
7735 *cost += extra_cost->vect.alu;
7737 else
7739 if (speed)
7740 /* ASR (register) and friends. */
7741 *cost += extra_cost->alu.shift_reg;
7743 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7744 && CONST_INT_P (XEXP (op1, 1))
7745 && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7747 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7748 /* We already demanded XEXP (op1, 0) to be REG_P, so
7749 don't recurse into it. */
7750 return true;
7753 return false; /* All arguments need to be in registers. */
7756 case SYMBOL_REF:
7758 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7759 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7761 /* LDR. */
7762 if (speed)
7763 *cost += extra_cost->ldst.load;
7765 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7766 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7768 /* ADRP, followed by ADD. */
7769 *cost += COSTS_N_INSNS (1);
7770 if (speed)
7771 *cost += 2 * extra_cost->alu.arith;
7773 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7774 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7776 /* ADR. */
7777 if (speed)
7778 *cost += extra_cost->alu.arith;
7781 if (flag_pic)
7783 /* One extra load instruction, after accessing the GOT. */
7784 *cost += COSTS_N_INSNS (1);
7785 if (speed)
7786 *cost += extra_cost->ldst.load;
7788 return true;
7790 case HIGH:
7791 case LO_SUM:
7792 /* ADRP/ADD (immediate). */
7793 if (speed)
7794 *cost += extra_cost->alu.arith;
7795 return true;
7797 case ZERO_EXTRACT:
7798 case SIGN_EXTRACT:
7799 /* UBFX/SBFX. */
7800 if (speed)
7802 if (VECTOR_MODE_P (mode))
7803 *cost += extra_cost->vect.alu;
7804 else
7805 *cost += extra_cost->alu.bfx;
7808 /* We can trust that the immediates used will be correct (there
7809 are no by-register forms), so we need only cost op0. */
7810 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7811 return true;
7813 case MULT:
7814 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7815 /* aarch64_rtx_mult_cost always handles recursion to its
7816 operands. */
7817 return true;
7819 case MOD:
7820 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7821 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as that of
7822 an unconditional negate. This case should only ever be reached through
7823 the set_smod_pow2_cheap check in expmed.c. */
7824 if (CONST_INT_P (XEXP (x, 1))
7825 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7826 && (mode == SImode || mode == DImode))
7828 /* We expand to 4 instructions. Reset the baseline. */
7829 *cost = COSTS_N_INSNS (4);
7831 if (speed)
7832 *cost += 2 * extra_cost->alu.logical
7833 + 2 * extra_cost->alu.arith;
7835 return true;
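	  /* Illustrative expansion (a sketch, not taken from the md file):
	     x % 8 in SImode becomes something along the lines of
		 negs	w1, w0
		 and	w0, w0, #7
		 and	w1, w1, #7
		 csneg	w0, w0, w1, mi
	     hence the four-instruction baseline set above.  */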
7838 /* Fall-through. */
7839 case UMOD:
7840 if (speed)
7842 /* Slightly prefer UMOD over SMOD. */
7843 if (VECTOR_MODE_P (mode))
7844 *cost += extra_cost->vect.alu;
7845 else if (GET_MODE_CLASS (mode) == MODE_INT)
7846 *cost += (extra_cost->mult[mode == DImode].add
7847 + extra_cost->mult[mode == DImode].idiv
7848 + (code == MOD ? 1 : 0));
7850 return false; /* All arguments need to be in registers. */
7852 case DIV:
7853 case UDIV:
7854 case SQRT:
7855 if (speed)
7857 if (VECTOR_MODE_P (mode))
7858 *cost += extra_cost->vect.alu;
7859 else if (GET_MODE_CLASS (mode) == MODE_INT)
7860 /* There is no integer SQRT, so only DIV and UDIV can get
7861 here. */
7862 *cost += (extra_cost->mult[mode == DImode].idiv
7863 /* Slightly prefer UDIV over SDIV. */
7864 + (code == DIV ? 1 : 0));
7865 else
7866 *cost += extra_cost->fp[mode == DFmode].div;
7868 return false; /* All arguments need to be in registers. */
7870 case IF_THEN_ELSE:
7871 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7872 XEXP (x, 2), cost, speed);
7874 case EQ:
7875 case NE:
7876 case GT:
7877 case GTU:
7878 case LT:
7879 case LTU:
7880 case GE:
7881 case GEU:
7882 case LE:
7883 case LEU:
7885 return false; /* All arguments must be in registers. */
7887 case FMA:
7888 op0 = XEXP (x, 0);
7889 op1 = XEXP (x, 1);
7890 op2 = XEXP (x, 2);
7892 if (speed)
7894 if (VECTOR_MODE_P (mode))
7895 *cost += extra_cost->vect.alu;
7896 else
7897 *cost += extra_cost->fp[mode == DFmode].fma;
7900 /* FMSUB, FNMADD, and FNMSUB are free. */
7901 if (GET_CODE (op0) == NEG)
7902 op0 = XEXP (op0, 0);
7904 if (GET_CODE (op2) == NEG)
7905 op2 = XEXP (op2, 0);
7907 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7908 and the by-element operand as operand 0. */
7909 if (GET_CODE (op1) == NEG)
7910 op1 = XEXP (op1, 0);
7912 /* Catch vector-by-element operations. The by-element operand can
7913 either be (vec_duplicate (vec_select (x))) or just
7914 (vec_select (x)), depending on whether we are multiplying by
7915 a vector or a scalar.
7917 Canonicalization is not very good in these cases: FMA4 will put the
7918 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7919 if (GET_CODE (op0) == VEC_DUPLICATE)
7920 op0 = XEXP (op0, 0);
7921 else if (GET_CODE (op1) == VEC_DUPLICATE)
7922 op1 = XEXP (op1, 0);
7924 if (GET_CODE (op0) == VEC_SELECT)
7925 op0 = XEXP (op0, 0);
7926 else if (GET_CODE (op1) == VEC_SELECT)
7927 op1 = XEXP (op1, 0);
7929 /* If the remaining parameters are not registers,
7930 get the cost to put them into registers. */
7931 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7932 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7933 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7934 return true;
7936 case FLOAT:
7937 case UNSIGNED_FLOAT:
7938 if (speed)
7939 *cost += extra_cost->fp[mode == DFmode].fromint;
7940 return false;
7942 case FLOAT_EXTEND:
7943 if (speed)
7945 if (VECTOR_MODE_P (mode))
7947 /* Vector widening conversion. */
7948 *cost += extra_cost->vect.alu;
7950 else
7951 *cost += extra_cost->fp[mode == DFmode].widen;
7953 return false;
7955 case FLOAT_TRUNCATE:
7956 if (speed)
7958 if (VECTOR_MODE_P (mode))
7960 /* Vector conversion. */
7961 *cost += extra_cost->vect.alu;
7963 else
7964 *cost += extra_cost->fp[mode == DFmode].narrow;
7966 return false;
7968 case FIX:
7969 case UNSIGNED_FIX:
7970 x = XEXP (x, 0);
7971 /* Strip the rounding part. They will all be implemented
7972 by the fcvt* family of instructions anyway. */
7973 if (GET_CODE (x) == UNSPEC)
7975 unsigned int uns_code = XINT (x, 1);
7977 if (uns_code == UNSPEC_FRINTA
7978 || uns_code == UNSPEC_FRINTM
7979 || uns_code == UNSPEC_FRINTN
7980 || uns_code == UNSPEC_FRINTP
7981 || uns_code == UNSPEC_FRINTZ)
7982 x = XVECEXP (x, 0, 0);
7985 if (speed)
7987 if (VECTOR_MODE_P (mode))
7988 *cost += extra_cost->vect.alu;
7989 else
7990 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7993 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7994 fixed-point fcvt. */
7995 if (GET_CODE (x) == MULT
7996 && ((VECTOR_MODE_P (mode)
7997 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7998 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8000 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8001 0, speed);
8002 return true;
8005 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8006 return true;
8008 case ABS:
8009 if (VECTOR_MODE_P (mode))
8011 /* ABS (vector). */
8012 if (speed)
8013 *cost += extra_cost->vect.alu;
8015 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8017 op0 = XEXP (x, 0);
8019 /* FABD, which is analogous to FADD. */
8020 if (GET_CODE (op0) == MINUS)
8022 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8023 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8024 if (speed)
8025 *cost += extra_cost->fp[mode == DFmode].addsub;
8027 return true;
8029 /* Simple FABS is analogous to FNEG. */
8030 if (speed)
8031 *cost += extra_cost->fp[mode == DFmode].neg;
8033 else
8035 /* Integer ABS will either be split into
8036 two arithmetic instructions, or will be an ABS
8037 (scalar), which we don't model. */
8038 *cost = COSTS_N_INSNS (2);
8039 if (speed)
8040 *cost += 2 * extra_cost->alu.arith;
8042 return false;
8044 case SMAX:
8045 case SMIN:
8046 if (speed)
8048 if (VECTOR_MODE_P (mode))
8049 *cost += extra_cost->vect.alu;
8050 else
8052 /* FMAXNM/FMINNM/FMAX/FMIN.
8053 TODO: This may not be accurate for all implementations, but
8054 we do not model this in the cost tables. */
8055 *cost += extra_cost->fp[mode == DFmode].addsub;
8058 return false;
8060 case UNSPEC:
8061 /* The floating point round to integer frint* instructions. */
8062 if (aarch64_frint_unspec_p (XINT (x, 1)))
8064 if (speed)
8065 *cost += extra_cost->fp[mode == DFmode].roundint;
8067 return false;
8070 if (XINT (x, 1) == UNSPEC_RBIT)
8072 if (speed)
8073 *cost += extra_cost->alu.rev;
8075 return false;
8077 break;
8079 case TRUNCATE:
8081 /* Decompose <su>muldi3_highpart. */
8082 if (/* (truncate:DI */
8083 mode == DImode
8084 /* (lshiftrt:TI */
8085 && GET_MODE (XEXP (x, 0)) == TImode
8086 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8087 /* (mult:TI */
8088 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8089 /* (ANY_EXTEND:TI (reg:DI))
8090 (ANY_EXTEND:TI (reg:DI))) */
8091 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8092 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8093 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8094 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8095 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8096 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8097 /* (const_int 64) */
8098 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8099 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8101 /* UMULH/SMULH. */
8102 if (speed)
8103 *cost += extra_cost->mult[mode == DImode].extend;
8104 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8105 mode, MULT, 0, speed);
8106 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8107 mode, MULT, 1, speed);
8108 return true;
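/* For illustration only: the RTL shape matched above is what a high-part
   multiply expands to, e.g. (assuming the GNU C __int128 idiom)

     int64_t hi = (int64_t) (((__int128) a * b) >> 64);

   which maps onto a single SMULH (UMULH for the unsigned variant), so
   only the two DImode multiplicands are costed on top of the extending
   multiply cost.  */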
8111 /* Fall through. */
8112 default:
8113 break;
8116 if (dump_file
8117 && flag_aarch64_verbose_cost)
8118 fprintf (dump_file,
8119 "\nFailed to cost RTX. Assuming default cost.\n");
8121 return true;
8124 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8125 calculated for X. This cost is stored in *COST. Returns true
8126 if the total cost of X was calculated. */
8127 static bool
8128 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8129 int param, int *cost, bool speed)
8131 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8133 if (dump_file
8134 && flag_aarch64_verbose_cost)
8136 print_rtl_single (dump_file, x);
8137 fprintf (dump_file, "\n%s cost: %d (%s)\n",
8138 speed ? "Hot" : "Cold",
8139 *cost, result ? "final" : "partial");
8142 return result;
8145 static int
8146 aarch64_register_move_cost (machine_mode mode,
8147 reg_class_t from_i, reg_class_t to_i)
8149 enum reg_class from = (enum reg_class) from_i;
8150 enum reg_class to = (enum reg_class) to_i;
8151 const struct cpu_regmove_cost *regmove_cost
8152 = aarch64_tune_params.regmove_cost;
8154 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8155 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8156 to = GENERAL_REGS;
8158 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8159 from = GENERAL_REGS;
8161 /* Moving between GPR and stack cost is the same as GP2GP. */
8162 if ((from == GENERAL_REGS && to == STACK_REG)
8163 || (to == GENERAL_REGS && from == STACK_REG))
8164 return regmove_cost->GP2GP;
8166 /* To/From the stack register, we move via the gprs. */
8167 if (to == STACK_REG || from == STACK_REG)
8168 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8169 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8171 if (GET_MODE_SIZE (mode) == 16)
8173 /* 128-bit operations on general registers require 2 instructions. */
8174 if (from == GENERAL_REGS && to == GENERAL_REGS)
8175 return regmove_cost->GP2GP * 2;
8176 else if (from == GENERAL_REGS)
8177 return regmove_cost->GP2FP * 2;
8178 else if (to == GENERAL_REGS)
8179 return regmove_cost->FP2GP * 2;
8181 /* When AdvSIMD instructions are disabled it is not possible to move
8182 a 128-bit value directly between Q registers. This is handled in
8183 secondary reload. A general register is used as a scratch to move
8184 the upper DI value and the lower DI value is moved directly,
8185 hence the cost is the sum of three moves. */
8186 if (! TARGET_SIMD)
8187 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8189 return regmove_cost->FP2FP;
8192 if (from == GENERAL_REGS && to == GENERAL_REGS)
8193 return regmove_cost->GP2GP;
8194 else if (from == GENERAL_REGS)
8195 return regmove_cost->GP2FP;
8196 else if (to == GENERAL_REGS)
8197 return regmove_cost->FP2GP;
8199 return regmove_cost->FP2FP;
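/* Worked example of the recursion above (illustrative, using the fields
   of the active regmove_cost table):

     aarch64_register_move_cost (SImode, FP_REGS, STACK_REG)
       == FP2GP + GP2GP   // FP -> GPR, then GPR -> stack register

   and without TARGET_SIMD a 128-bit FP-to-FP move costs
   GP2FP + FP2GP + FP2FP, matching the three-move secondary reload
   described above.  */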
8202 static int
8203 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8204 reg_class_t rclass ATTRIBUTE_UNUSED,
8205 bool in ATTRIBUTE_UNUSED)
8207 return aarch64_tune_params.memmov_cost;
8210 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8211 to optimize 1.0/sqrt. */
8213 static bool
8214 use_rsqrt_p (machine_mode mode)
8216 return (!flag_trapping_math
8217 && flag_unsafe_math_optimizations
8218 && ((aarch64_tune_params.approx_modes->recip_sqrt
8219 & AARCH64_APPROX_MODE (mode))
8220 || flag_mrecip_low_precision_sqrt));
8223 /* Function to decide when to use the approximate reciprocal square root
8224 builtin. */
8226 static tree
8227 aarch64_builtin_reciprocal (tree fndecl)
8229 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8231 if (!use_rsqrt_p (mode))
8232 return NULL_TREE;
8233 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8236 typedef rtx (*rsqrte_type) (rtx, rtx);
8238 /* Select reciprocal square root initial estimate insn depending on machine
8239 mode. */
8241 static rsqrte_type
8242 get_rsqrte_type (machine_mode mode)
8244 switch (mode)
8246 case E_DFmode: return gen_aarch64_rsqrtedf;
8247 case E_SFmode: return gen_aarch64_rsqrtesf;
8248 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8249 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8250 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8251 default: gcc_unreachable ();
8255 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8257 /* Select reciprocal square root series step insn depending on machine mode. */
8259 static rsqrts_type
8260 get_rsqrts_type (machine_mode mode)
8262 switch (mode)
8264 case E_DFmode: return gen_aarch64_rsqrtsdf;
8265 case E_SFmode: return gen_aarch64_rsqrtssf;
8266 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8267 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8268 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8269 default: gcc_unreachable ();
8273 /* Emit instruction sequence to compute either the approximate square root
8274 or its approximate reciprocal, depending on the flag RECP, and return
8275 whether the sequence was emitted or not. */
8277 bool
8278 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8280 machine_mode mode = GET_MODE (dst);
8282 if (GET_MODE_INNER (mode) == HFmode)
8284 gcc_assert (!recp);
8285 return false;
8288 machine_mode mmsk
8289 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)).require (),
8290 GET_MODE_NUNITS (mode));
8291 if (!recp)
8293 if (!(flag_mlow_precision_sqrt
8294 || (aarch64_tune_params.approx_modes->sqrt
8295 & AARCH64_APPROX_MODE (mode))))
8296 return false;
8298 if (flag_finite_math_only
8299 || flag_trapping_math
8300 || !flag_unsafe_math_optimizations
8301 || optimize_function_for_size_p (cfun))
8302 return false;
8304 else
8305 /* Caller assumes we cannot fail. */
8306 gcc_assert (use_rsqrt_p (mode));
8309 rtx xmsk = gen_reg_rtx (mmsk);
8310 if (!recp)
8311 /* When calculating the approximate square root, compare the
8312 argument with 0.0 and create a mask. */
8313 emit_insn (gen_rtx_SET (xmsk,
8314 gen_rtx_NEG (mmsk,
8315 gen_rtx_EQ (mmsk, src,
8316 CONST0_RTX (mode)))));
8318 /* Estimate the approximate reciprocal square root. */
8319 rtx xdst = gen_reg_rtx (mode);
8320 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8322 /* Iterate over the series twice for SF and thrice for DF. */
8323 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8325 /* Optionally do one fewer iteration of the series for faster
8326 performance, at the cost of some accuracy. */
8327 if ((recp && flag_mrecip_low_precision_sqrt)
8328 || (!recp && flag_mlow_precision_sqrt))
8329 iterations--;
8331 /* Iterate over the series to calculate the approximate reciprocal square
8332 root. */
8333 rtx x1 = gen_reg_rtx (mode);
8334 while (iterations--)
8336 rtx x2 = gen_reg_rtx (mode);
8337 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8339 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8341 if (iterations > 0)
8342 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8345 if (!recp)
8347 /* Qualify the approximate reciprocal square root when the argument is
8348 0.0 by squashing the intermediary result to 0.0. */
8349 rtx xtmp = gen_reg_rtx (mmsk);
8350 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8351 gen_rtx_SUBREG (mmsk, xdst, 0)));
8352 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8354 /* Calculate the approximate square root. */
8355 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8358 /* Finalize the approximation. */
8359 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8361 return true;
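/* For reference (a scalar sketch, not used by the compiler): each
   FRSQRTS + FMUL pair emitted above performs one Newton-Raphson step for
   1/sqrt(x), roughly

     double nr_rsqrt_step (double x, double est)
     {
       return est * (3.0 - x * est * est) / 2.0;
     }

   with two steps for SFmode and three for DFmode as chosen above; the
   final multiply by SRC turns the reciprocal square root into the square
   root itself when !RECP.  */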
8364 typedef rtx (*recpe_type) (rtx, rtx);
8366 /* Select reciprocal initial estimate insn depending on machine mode. */
8368 static recpe_type
8369 get_recpe_type (machine_mode mode)
8371 switch (mode)
8373 case E_SFmode: return (gen_aarch64_frecpesf);
8374 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8375 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8376 case E_DFmode: return (gen_aarch64_frecpedf);
8377 case E_V2DFmode: return (gen_aarch64_frecpev2df);
8378 default: gcc_unreachable ();
8382 typedef rtx (*recps_type) (rtx, rtx, rtx);
8384 /* Select reciprocal series step insn depending on machine mode. */
8386 static recps_type
8387 get_recps_type (machine_mode mode)
8389 switch (mode)
8391 case E_SFmode: return (gen_aarch64_frecpssf);
8392 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8393 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8394 case E_DFmode: return (gen_aarch64_frecpsdf);
8395 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8396 default: gcc_unreachable ();
8400 /* Emit the instruction sequence to compute the approximation for the division
8401 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8403 bool
8404 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8406 machine_mode mode = GET_MODE (quo);
8408 if (GET_MODE_INNER (mode) == HFmode)
8409 return false;
8411 bool use_approx_division_p = (flag_mlow_precision_div
8412 || (aarch64_tune_params.approx_modes->division
8413 & AARCH64_APPROX_MODE (mode)));
8415 if (!flag_finite_math_only
8416 || flag_trapping_math
8417 || !flag_unsafe_math_optimizations
8418 || optimize_function_for_size_p (cfun)
8419 || !use_approx_division_p)
8420 return false;
8422 /* Estimate the approximate reciprocal. */
8423 rtx xrcp = gen_reg_rtx (mode);
8424 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8426 /* Iterate over the series twice for SF and thrice for DF. */
8427 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8429 /* Optionally do one fewer iteration of the series for faster
8430 performance, at the cost of some accuracy. */
8431 if (flag_mlow_precision_div)
8432 iterations--;
8434 /* Iterate over the series to calculate the approximate reciprocal. */
8435 rtx xtmp = gen_reg_rtx (mode);
8436 while (iterations--)
8438 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8440 if (iterations > 0)
8441 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8444 if (num != CONST1_RTX (mode))
8446 /* As the approximate reciprocal of DEN is already calculated, only
8447 calculate the approximate division when NUM is not 1.0. */
8448 rtx xnum = force_reg (mode, num);
8449 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8452 /* Finalize the approximation. */
8453 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8454 return true;
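/* For reference (a scalar sketch, not used by the compiler): each
   FRECPS + FMUL pair emitted above performs one Newton-Raphson step for
   1/d, roughly

     double nr_recip_step (double d, double est)
     {
       return est * (2.0 - d * est);
     }

   and the final multiply by NUM (when NUM is not 1.0) turns the
   approximate reciprocal into the approximate quotient NUM / DEN.  */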
8457 /* Return the number of instructions that can be issued per cycle. */
8458 static int
8459 aarch64_sched_issue_rate (void)
8461 return aarch64_tune_params.issue_rate;
8464 static int
8465 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8467 int issue_rate = aarch64_sched_issue_rate ();
8469 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8473 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8474 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8475 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8477 static int
8478 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8479 int ready_index)
8481 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8485 /* Vectorizer cost model target hooks. */
8487 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8488 static int
8489 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8490 tree vectype,
8491 int misalign ATTRIBUTE_UNUSED)
8493 unsigned elements;
8494 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8495 bool fp = false;
8497 if (vectype != NULL)
8498 fp = FLOAT_TYPE_P (vectype);
8500 switch (type_of_cost)
8502 case scalar_stmt:
8503 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8505 case scalar_load:
8506 return costs->scalar_load_cost;
8508 case scalar_store:
8509 return costs->scalar_store_cost;
8511 case vector_stmt:
8512 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8514 case vector_load:
8515 return costs->vec_align_load_cost;
8517 case vector_store:
8518 return costs->vec_store_cost;
8520 case vec_to_scalar:
8521 return costs->vec_to_scalar_cost;
8523 case scalar_to_vec:
8524 return costs->scalar_to_vec_cost;
8526 case unaligned_load:
8527 return costs->vec_unalign_load_cost;
8529 case unaligned_store:
8530 return costs->vec_unalign_store_cost;
8532 case cond_branch_taken:
8533 return costs->cond_taken_branch_cost;
8535 case cond_branch_not_taken:
8536 return costs->cond_not_taken_branch_cost;
8538 case vec_perm:
8539 return costs->vec_permute_cost;
8541 case vec_promote_demote:
8542 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8544 case vec_construct:
8545 elements = TYPE_VECTOR_SUBPARTS (vectype);
8546 return elements / 2 + 1;
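/* Example of the vec_construct formula above: building a V4SF vector
   from four scalars is costed at 4 / 2 + 1 == 3, and a V2DI vector at
   2 / 2 + 1 == 2.  */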
8548 default:
8549 gcc_unreachable ();
8553 /* Implement targetm.vectorize.add_stmt_cost. */
8554 static unsigned
8555 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8556 struct _stmt_vec_info *stmt_info, int misalign,
8557 enum vect_cost_model_location where)
8559 unsigned *cost = (unsigned *) data;
8560 unsigned retval = 0;
8562 if (flag_vect_cost_model)
8564 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8565 int stmt_cost =
8566 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8568 /* Statements in an inner loop relative to the loop being
8569 vectorized are weighted more heavily. The value here is
8570 arbitrary and could potentially be improved with analysis. */
8571 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8572 count *= 50; /* FIXME */
8574 retval = (unsigned) (count * stmt_cost);
8575 cost[where] += retval;
8578 return retval;
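/* Example of the weighting above: a vector statement with a base cost of
   2 that lies in the inner loop of the nest being vectorized is
   accumulated into the vect_body bucket as 2 * 50 == 100.  */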
8581 static void initialize_aarch64_code_model (struct gcc_options *);
8583 /* Parse the TO_PARSE string and put the architecture struct that it
8584 selects into RES and the architectural features into ISA_FLAGS.
8585 Return an aarch64_parse_opt_result describing the parse result.
8586 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8588 static enum aarch64_parse_opt_result
8589 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8590 unsigned long *isa_flags)
8592 char *ext;
8593 const struct processor *arch;
8594 char *str = (char *) alloca (strlen (to_parse) + 1);
8595 size_t len;
8597 strcpy (str, to_parse);
8599 ext = strchr (str, '+');
8601 if (ext != NULL)
8602 len = ext - str;
8603 else
8604 len = strlen (str);
8606 if (len == 0)
8607 return AARCH64_PARSE_MISSING_ARG;
8610 /* Loop through the list of supported ARCHes to find a match. */
8611 for (arch = all_architectures; arch->name != NULL; arch++)
8613 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8615 unsigned long isa_temp = arch->flags;
8617 if (ext != NULL)
8619 /* TO_PARSE string contains at least one extension. */
8620 enum aarch64_parse_opt_result ext_res
8621 = aarch64_parse_extension (ext, &isa_temp);
8623 if (ext_res != AARCH64_PARSE_OK)
8624 return ext_res;
8626 /* Extension parsing was successful. Confirm the result
8627 arch and ISA flags. */
8628 *res = arch;
8629 *isa_flags = isa_temp;
8630 return AARCH64_PARSE_OK;
8634 /* ARCH name not found in list. */
8635 return AARCH64_PARSE_INVALID_ARG;
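/* Example of the split above: for -march=armv8-a+crc the string is
   divided at the first '+', so the architecture table is searched for
   "armv8-a" and "+crc" is handed to aarch64_parse_extension.  */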
8638 /* Parse the TO_PARSE string and put the result tuning in RES and the
8639 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8640 describing the parse result. If there is an error parsing, RES and
8641 ISA_FLAGS are left unchanged. */
8643 static enum aarch64_parse_opt_result
8644 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8645 unsigned long *isa_flags)
8647 char *ext;
8648 const struct processor *cpu;
8649 char *str = (char *) alloca (strlen (to_parse) + 1);
8650 size_t len;
8652 strcpy (str, to_parse);
8654 ext = strchr (str, '+');
8656 if (ext != NULL)
8657 len = ext - str;
8658 else
8659 len = strlen (str);
8661 if (len == 0)
8662 return AARCH64_PARSE_MISSING_ARG;
8665 /* Loop through the list of supported CPUs to find a match. */
8666 for (cpu = all_cores; cpu->name != NULL; cpu++)
8668 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8670 unsigned long isa_temp = cpu->flags;
8673 if (ext != NULL)
8675 /* TO_PARSE string contains at least one extension. */
8676 enum aarch64_parse_opt_result ext_res
8677 = aarch64_parse_extension (ext, &isa_temp);
8679 if (ext_res != AARCH64_PARSE_OK)
8680 return ext_res;
8682 /* Extension parsing was successful. Confirm the result
8683 cpu and ISA flags. */
8684 *res = cpu;
8685 *isa_flags = isa_temp;
8686 return AARCH64_PARSE_OK;
8690 /* CPU name not found in list. */
8691 return AARCH64_PARSE_INVALID_ARG;
8694 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8695 Return an aarch64_parse_opt_result describing the parse result.
8696 If the parsing fails the RES does not change. */
8698 static enum aarch64_parse_opt_result
8699 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8701 const struct processor *cpu;
8702 char *str = (char *) alloca (strlen (to_parse) + 1);
8704 strcpy (str, to_parse);
8706 /* Loop through the list of supported CPUs to find a match. */
8707 for (cpu = all_cores; cpu->name != NULL; cpu++)
8709 if (strcmp (cpu->name, str) == 0)
8711 *res = cpu;
8712 return AARCH64_PARSE_OK;
8716 /* CPU name not found in list. */
8717 return AARCH64_PARSE_INVALID_ARG;
8720 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8721 described in FLAG. If it is, return the index bit for that fusion type.
8722 If not, report an error (mentioning OPTION_NAME) and return zero. */
8724 static unsigned int
8725 aarch64_parse_one_option_token (const char *token,
8726 size_t length,
8727 const struct aarch64_flag_desc *flag,
8728 const char *option_name)
8730 for (; flag->name != NULL; flag++)
8732 if (length == strlen (flag->name)
8733 && !strncmp (flag->name, token, length))
8734 return flag->flag;
8737 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8738 return 0;
8741 /* Parse OPTION which is a comma-separated list of flags to enable.
8742 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8743 default state we inherit from the CPU tuning structures. OPTION_NAME
8744 gives the top-level option we are parsing in the -moverride string,
8745 for use in error messages. */
8747 static unsigned int
8748 aarch64_parse_boolean_options (const char *option,
8749 const struct aarch64_flag_desc *flags,
8750 unsigned int initial_state,
8751 const char *option_name)
8753 const char separator = '.';
8754 const char* specs = option;
8755 const char* ntoken = option;
8756 unsigned int found_flags = initial_state;
8758 while ((ntoken = strchr (specs, separator)))
8760 size_t token_length = ntoken - specs;
8761 unsigned token_ops = aarch64_parse_one_option_token (specs,
8762 token_length,
8763 flags,
8764 option_name);
8765 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8766 in the token stream, reset the supported operations. So:
8768 adrp+add.cmp+branch.none.adrp+add
8770 would have the result of turning on only adrp+add fusion. */
8771 if (!token_ops)
8772 found_flags = 0;
8774 found_flags |= token_ops;
8775 specs = ++ntoken;
8778 /* The string ended with a trailing separator; report it as ill-formed. */
8779 if (!(*specs))
8781 error ("%s string ill-formed\n", option_name);
8782 return 0;
8785 /* We still have one more token to parse. */
8786 size_t token_length = strlen (specs);
8787 unsigned token_ops = aarch64_parse_one_option_token (specs,
8788 token_length,
8789 flags,
8790 option_name);
8791 if (!token_ops)
8792 found_flags = 0;
8794 found_flags |= token_ops;
8795 return found_flags;
8798 /* Support for overriding instruction fusion. */
8800 static void
8801 aarch64_parse_fuse_string (const char *fuse_string,
8802 struct tune_params *tune)
8804 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8805 aarch64_fusible_pairs,
8806 tune->fusible_ops,
8807 "fuse=");
8810 /* Support for overriding other tuning flags. */
8812 static void
8813 aarch64_parse_tune_string (const char *tune_string,
8814 struct tune_params *tune)
8816 tune->extra_tuning_flags
8817 = aarch64_parse_boolean_options (tune_string,
8818 aarch64_tuning_flags,
8819 tune->extra_tuning_flags,
8820 "tune=");
8823 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8824 we understand. If it is, extract the option string and hand it off to
8825 the appropriate function. */
8827 void
8828 aarch64_parse_one_override_token (const char* token,
8829 size_t length,
8830 struct tune_params *tune)
8832 const struct aarch64_tuning_override_function *fn
8833 = aarch64_tuning_override_functions;
8835 const char *option_part = strchr (token, '=');
8836 if (!option_part)
8838 error ("tuning string missing in option (%s)", token);
8839 return;
8842 /* Get the length of the option name. */
8843 length = option_part - token;
8844 /* Skip the '=' to get to the option string. */
8845 option_part++;
8847 for (; fn->name != NULL; fn++)
8849 if (!strncmp (fn->name, token, length))
8851 fn->parse_override (option_part, tune);
8852 return;
8856 error ("unknown tuning option (%s)", token);
8857 return;
8860 /* A checking mechanism for the implementation of the tls size. */
8862 static void
8863 initialize_aarch64_tls_size (struct gcc_options *opts)
8865 if (aarch64_tls_size == 0)
8866 aarch64_tls_size = 24;
8868 switch (opts->x_aarch64_cmodel_var)
8870 case AARCH64_CMODEL_TINY:
8871 /* Both the default and the maximum TLS size allowed under tiny are 1M,
8872 which needs two instructions to address, so we clamp the size to 24 bits. */
8873 if (aarch64_tls_size > 24)
8874 aarch64_tls_size = 24;
8875 break;
8876 case AARCH64_CMODEL_SMALL:
8877 /* The maximum TLS size allowed under small is 4G. */
8878 if (aarch64_tls_size > 32)
8879 aarch64_tls_size = 32;
8880 break;
8881 case AARCH64_CMODEL_LARGE:
8882 /* The maximum TLS size allowed under large is 16E.
8883 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
8884 if (aarch64_tls_size > 48)
8885 aarch64_tls_size = 48;
8886 break;
8887 default:
8888 gcc_unreachable ();
8891 return;
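/* Example of the clamping above (illustrative): with -mcmodel=tiny, a
   request such as -mtls-size=32 is reduced to 24 bits, since the tiny
   model can only address a 1M TLS area with its two-instruction
   sequence.  */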
8894 /* Parse STRING looking for options in the format:
8895 string :: option:string
8896 option :: name=substring
8897 name :: {a-z}
8898 substring :: defined by option. */
8900 static void
8901 aarch64_parse_override_string (const char* input_string,
8902 struct tune_params* tune)
8904 const char separator = ':';
8905 size_t string_length = strlen (input_string) + 1;
8906 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8907 char *string = string_root;
8908 strncpy (string, input_string, string_length);
8909 string[string_length - 1] = '\0';
8911 char* ntoken = string;
8913 while ((ntoken = strchr (string, separator)))
8915 size_t token_length = ntoken - string;
8916 /* Make this substring look like a string. */
8917 *ntoken = '\0';
8918 aarch64_parse_one_override_token (string, token_length, tune);
8919 string = ++ntoken;
8922 /* One last option to parse. */
8923 aarch64_parse_one_override_token (string, strlen (string), tune);
8924 free (string_root);
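/* Example of the grammar above: an option such as

     -moverride=fuse=adrp+add.cmp+branch:tune=<dot-separated flags>

   is split on ':' into a "fuse=..." token and a "tune=..." token here,
   and each token is dispatched through aarch64_parse_one_override_token
   to aarch64_parse_fuse_string or aarch64_parse_tune_string above.  */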
8928 static void
8929 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8931 /* The logic here is that if we are disabling all frame pointer generation
8932 then we do not need to disable leaf frame pointer generation as a
8933 separate operation. But if we are *only* disabling leaf frame pointer
8934 generation then we set flag_omit_frame_pointer to true, but in
8935 aarch64_frame_pointer_required we return false only for leaf functions.
8937 PR 70044: We have to be careful about being called multiple times for the
8938 same function. Once we have decided to set flag_omit_frame_pointer just
8939 so that we can omit leaf frame pointers, we must then not interpret a
8940 second call as meaning that all frame pointer generation should be
8941 omitted. We do this by setting flag_omit_frame_pointer to a special,
8942 non-zero value. */
8943 if (opts->x_flag_omit_frame_pointer == 2)
8944 opts->x_flag_omit_frame_pointer = 0;
8946 if (opts->x_flag_omit_frame_pointer)
8947 opts->x_flag_omit_leaf_frame_pointer = false;
8948 else if (opts->x_flag_omit_leaf_frame_pointer)
8949 opts->x_flag_omit_frame_pointer = 2;
8951 /* If not optimizing for size, set the default
8952 alignment to what the target wants. */
8953 if (!opts->x_optimize_size)
8955 if (opts->x_align_loops <= 0)
8956 opts->x_align_loops = aarch64_tune_params.loop_align;
8957 if (opts->x_align_jumps <= 0)
8958 opts->x_align_jumps = aarch64_tune_params.jump_align;
8959 if (opts->x_align_functions <= 0)
8960 opts->x_align_functions = aarch64_tune_params.function_align;
8963 /* We default to no pc-relative literal loads. */
8965 aarch64_pcrelative_literal_loads = false;
8967 /* If -mpc-relative-literal-loads is set on the command line, this
8968 implies that the user asked for PC relative literal loads. */
8969 if (opts->x_pcrelative_literal_loads == 1)
8970 aarch64_pcrelative_literal_loads = true;
8972 /* This is PR70113. When building the Linux kernel with
8973 CONFIG_ARM64_ERRATUM_843419, support for relocations
8974 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8975 removed from the kernel to avoid loading objects with possibly
8976 offending sequences. Without -mpc-relative-literal-loads we would
8977 generate such relocations, preventing the kernel build from
8978 succeeding. */
8979 if (opts->x_pcrelative_literal_loads == 2
8980 && TARGET_FIX_ERR_A53_843419)
8981 aarch64_pcrelative_literal_loads = true;
8983 /* In the tiny memory model it makes no sense to disallow PC relative
8984 literal pool loads. */
8985 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8986 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8987 aarch64_pcrelative_literal_loads = true;
8989 /* When enabling the lower precision Newton series for the square root, also
8990 enable it for the reciprocal square root, since the latter is an
8991 intermediary step for the former. */
8992 if (flag_mlow_precision_sqrt)
8993 flag_mrecip_low_precision_sqrt = true;
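/* Worked example of the PR 70044 handling above: with
   -momit-leaf-frame-pointer but without -fomit-frame-pointer, the first
   call through here sets flag_omit_frame_pointer to the special value 2;
   a later call recognises the 2, resets it to 0 and re-derives the
   leaf-only behaviour, instead of misreading it as an explicit request
   to omit all frame pointers.  */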
8996 /* 'Unpack' up the internal tuning structs and update the options
8997 in OPTS. The caller must have set up selected_tune and selected_arch
8998 as all the other target-specific codegen decisions are
8999 derived from them. */
9001 void
9002 aarch64_override_options_internal (struct gcc_options *opts)
9004 aarch64_tune_flags = selected_tune->flags;
9005 aarch64_tune = selected_tune->sched_core;
9006 /* Make a copy of the tuning parameters attached to the core, which
9007 we may later overwrite. */
9008 aarch64_tune_params = *(selected_tune->tune);
9009 aarch64_architecture_version = selected_arch->architecture_version;
9011 if (opts->x_aarch64_override_tune_string)
9012 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9013 &aarch64_tune_params);
9015 /* This target defaults to strict volatile bitfields. */
9016 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9017 opts->x_flag_strict_volatile_bitfields = 1;
9019 initialize_aarch64_code_model (opts);
9020 initialize_aarch64_tls_size (opts);
9022 int queue_depth = 0;
9023 switch (aarch64_tune_params.autoprefetcher_model)
9025 case tune_params::AUTOPREFETCHER_OFF:
9026 queue_depth = -1;
9027 break;
9028 case tune_params::AUTOPREFETCHER_WEAK:
9029 queue_depth = 0;
9030 break;
9031 case tune_params::AUTOPREFETCHER_STRONG:
9032 queue_depth = max_insn_queue_index + 1;
9033 break;
9034 default:
9035 gcc_unreachable ();
9038 /* We don't mind passing in global_options_set here as we don't use
9039 the *options_set structs anyway. */
9040 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9041 queue_depth,
9042 opts->x_param_values,
9043 global_options_set.x_param_values);
9045 /* Set up parameters to be used in prefetching algorithm. Do not
9046 override the defaults unless we are tuning for a core we have
9047 researched values for. */
9048 if (aarch64_tune_params.prefetch->num_slots > 0)
9049 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9050 aarch64_tune_params.prefetch->num_slots,
9051 opts->x_param_values,
9052 global_options_set.x_param_values);
9053 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9054 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9055 aarch64_tune_params.prefetch->l1_cache_size,
9056 opts->x_param_values,
9057 global_options_set.x_param_values);
9058 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9059 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9060 aarch64_tune_params.prefetch->l1_cache_line_size,
9061 opts->x_param_values,
9062 global_options_set.x_param_values);
9063 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9064 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9065 aarch64_tune_params.prefetch->l2_cache_size,
9066 opts->x_param_values,
9067 global_options_set.x_param_values);
9069 /* Enable software prefetching at the specified optimization level for
9070 CPUs that have prefetch tuning data. Lower the optimization level
9071 threshold by 1 when profiling is enabled. */
9072 if (opts->x_flag_prefetch_loop_arrays < 0
9073 && !opts->x_optimize_size
9074 && aarch64_tune_params.prefetch->default_opt_level >= 0
9075 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9076 opts->x_flag_prefetch_loop_arrays = 1;
9078 aarch64_override_options_after_change_1 (opts);
9081 /* Print a hint with a suggestion for a core or architecture name that
9082 most closely resembles what the user passed in STR. ARCH is true if
9083 the user is asking for an architecture name. ARCH is false if the user
9084 is asking for a core name. */
9086 static void
9087 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9089 auto_vec<const char *> candidates;
9090 const struct processor *entry = arch ? all_architectures : all_cores;
9091 for (; entry->name != NULL; entry++)
9092 candidates.safe_push (entry->name);
9093 char *s;
9094 const char *hint = candidates_list_and_hint (str, s, candidates);
9095 if (hint)
9096 inform (input_location, "valid arguments are: %s;"
9097 " did you mean %qs?", s, hint);
9098 XDELETEVEC (s);
9101 /* Print a hint with a suggestion for a core name that most closely resembles
9102 what the user passed in STR. */
9104 inline static void
9105 aarch64_print_hint_for_core (const char *str)
9107 aarch64_print_hint_for_core_or_arch (str, false);
9110 /* Print a hint with a suggestion for an architecture name that most closely
9111 resembles what the user passed in STR. */
9113 inline static void
9114 aarch64_print_hint_for_arch (const char *str)
9116 aarch64_print_hint_for_core_or_arch (str, true);
9119 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9120 specified in STR and throw errors if appropriate. Put the results if
9121 they are valid in RES and ISA_FLAGS. Return whether the option is
9122 valid. */
9124 static bool
9125 aarch64_validate_mcpu (const char *str, const struct processor **res,
9126 unsigned long *isa_flags)
9128 enum aarch64_parse_opt_result parse_res
9129 = aarch64_parse_cpu (str, res, isa_flags);
9131 if (parse_res == AARCH64_PARSE_OK)
9132 return true;
9134 switch (parse_res)
9136 case AARCH64_PARSE_MISSING_ARG:
9137 error ("missing cpu name in %<-mcpu=%s%>", str);
9138 break;
9139 case AARCH64_PARSE_INVALID_ARG:
9140 error ("unknown value %qs for -mcpu", str);
9141 aarch64_print_hint_for_core (str);
9142 break;
9143 case AARCH64_PARSE_INVALID_FEATURE:
9144 error ("invalid feature modifier in %<-mcpu=%s%>", str);
9145 break;
9146 default:
9147 gcc_unreachable ();
9150 return false;
9153 /* Validate a command-line -march option. Parse the arch and extensions
9154 (if any) specified in STR and throw errors if appropriate. Put the
9155 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9156 option is valid. */
9158 static bool
9159 aarch64_validate_march (const char *str, const struct processor **res,
9160 unsigned long *isa_flags)
9162 enum aarch64_parse_opt_result parse_res
9163 = aarch64_parse_arch (str, res, isa_flags);
9165 if (parse_res == AARCH64_PARSE_OK)
9166 return true;
9168 switch (parse_res)
9170 case AARCH64_PARSE_MISSING_ARG:
9171 error ("missing arch name in %<-march=%s%>", str);
9172 break;
9173 case AARCH64_PARSE_INVALID_ARG:
9174 error ("unknown value %qs for -march", str);
9175 aarch64_print_hint_for_arch (str);
9176 break;
9177 case AARCH64_PARSE_INVALID_FEATURE:
9178 error ("invalid feature modifier in %<-march=%s%>", str);
9179 break;
9180 default:
9181 gcc_unreachable ();
9184 return false;
9187 /* Validate a command-line -mtune option. Parse the cpu
9188 specified in STR and throw errors if appropriate. Put the
9189 result, if it is valid, in RES. Return whether the option is
9190 valid. */
9192 static bool
9193 aarch64_validate_mtune (const char *str, const struct processor **res)
9195 enum aarch64_parse_opt_result parse_res
9196 = aarch64_parse_tune (str, res);
9198 if (parse_res == AARCH64_PARSE_OK)
9199 return true;
9201 switch (parse_res)
9203 case AARCH64_PARSE_MISSING_ARG:
9204 error ("missing cpu name in %<-mtune=%s%>", str);
9205 break;
9206 case AARCH64_PARSE_INVALID_ARG:
9207 error ("unknown value %qs for -mtune", str);
9208 aarch64_print_hint_for_core (str);
9209 break;
9210 default:
9211 gcc_unreachable ();
9213 return false;
9216 /* Return the CPU corresponding to the enum CPU.
9217 If it doesn't specify a cpu, return the default. */
9219 static const struct processor *
9220 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9222 if (cpu != aarch64_none)
9223 return &all_cores[cpu];
9225 /* The & 0x3f is to extract the bottom 6 bits that encode the
9226 default cpu as selected by the --with-cpu GCC configure option
9227 in config.gcc.
9228 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9229 flags mechanism should be reworked to make it more sane. */
9230 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
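/* Note on the encoding above: TARGET_CPU_DEFAULT packs the configure-time
   cpu in its low 6 bits and the corresponding default ISA flags in the
   remaining bits, schematically

     default cpu   == (TARGET_CPU_DEFAULT & 0x3f)
     default flags == (TARGET_CPU_DEFAULT >> 6)

   the second half of which is consumed in aarch64_override_options
   below.  */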
9233 /* Return the architecture corresponding to the enum ARCH.
9234 If it doesn't specify a valid architecture, return the default. */
9236 static const struct processor *
9237 aarch64_get_arch (enum aarch64_arch arch)
9239 if (arch != aarch64_no_arch)
9240 return &all_architectures[arch];
9242 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9244 return &all_architectures[cpu->arch];
9247 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9248 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9249 tuning structs. In particular it must set selected_tune and
9250 aarch64_isa_flags that define the available ISA features and tuning
9251 decisions. It must also set selected_arch as this will be used to
9252 output the .arch asm tags for each function. */
9254 static void
9255 aarch64_override_options (void)
9257 unsigned long cpu_isa = 0;
9258 unsigned long arch_isa = 0;
9259 aarch64_isa_flags = 0;
9261 bool valid_cpu = true;
9262 bool valid_tune = true;
9263 bool valid_arch = true;
9265 selected_cpu = NULL;
9266 selected_arch = NULL;
9267 selected_tune = NULL;
9269 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9270 If either of -march or -mtune is given, they override their
9271 respective component of -mcpu. */
9272 if (aarch64_cpu_string)
9273 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9274 &cpu_isa);
9276 if (aarch64_arch_string)
9277 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9278 &arch_isa);
9280 if (aarch64_tune_string)
9281 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9283 /* If the user did not specify a processor, choose the default
9284 one for them. This will be the CPU set during configuration using
9285 --with-cpu, otherwise it is "generic". */
9286 if (!selected_cpu)
9288 if (selected_arch)
9290 selected_cpu = &all_cores[selected_arch->ident];
9291 aarch64_isa_flags = arch_isa;
9292 explicit_arch = selected_arch->arch;
9294 else
9296 /* Get default configure-time CPU. */
9297 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9298 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9301 if (selected_tune)
9302 explicit_tune_core = selected_tune->ident;
9304 /* If both -mcpu and -march are specified check that they are architecturally
9305 compatible, warn if they're not and prefer the -march ISA flags. */
9306 else if (selected_arch)
9308 if (selected_arch->arch != selected_cpu->arch)
9310 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9311 all_architectures[selected_cpu->arch].name,
9312 selected_arch->name);
9314 aarch64_isa_flags = arch_isa;
9315 explicit_arch = selected_arch->arch;
9316 explicit_tune_core = selected_tune ? selected_tune->ident
9317 : selected_cpu->ident;
9319 else
9321 /* -mcpu but no -march. */
9322 aarch64_isa_flags = cpu_isa;
9323 explicit_tune_core = selected_tune ? selected_tune->ident
9324 : selected_cpu->ident;
9325 gcc_assert (selected_cpu);
9326 selected_arch = &all_architectures[selected_cpu->arch];
9327 explicit_arch = selected_arch->arch;
9330 /* Set the arch as well, as we will need it when outputting
9331 the .arch directive in assembly. */
9332 if (!selected_arch)
9334 gcc_assert (selected_cpu);
9335 selected_arch = &all_architectures[selected_cpu->arch];
9338 if (!selected_tune)
9339 selected_tune = selected_cpu;
9341 #ifndef HAVE_AS_MABI_OPTION
9342 /* The compiler may have been configured with 2.23.* binutils, which does
9343 not have support for ILP32. */
9344 if (TARGET_ILP32)
9345 error ("Assembler does not support -mabi=ilp32");
9346 #endif
9348 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9349 sorry ("Return address signing is only supported for -mabi=lp64");
9351 /* Make sure we properly set up the explicit options. */
9352 if ((aarch64_cpu_string && valid_cpu)
9353 || (aarch64_tune_string && valid_tune))
9354 gcc_assert (explicit_tune_core != aarch64_none);
9356 if ((aarch64_cpu_string && valid_cpu)
9357 || (aarch64_arch_string && valid_arch))
9358 gcc_assert (explicit_arch != aarch64_no_arch);
9360 aarch64_override_options_internal (&global_options);
9362 /* Save these options as the default ones in case we push and pop them later
9363 while processing functions with potential target attributes. */
9364 target_option_default_node = target_option_current_node
9365 = build_target_option_node (&global_options);
9368 /* Implement targetm.override_options_after_change. */
9370 static void
9371 aarch64_override_options_after_change (void)
9373 aarch64_override_options_after_change_1 (&global_options);
9376 static struct machine_function *
9377 aarch64_init_machine_status (void)
9379 struct machine_function *machine;
9380 machine = ggc_cleared_alloc<machine_function> ();
9381 return machine;
9384 void
9385 aarch64_init_expanders (void)
9387 init_machine_status = aarch64_init_machine_status;
9390 /* A checking mechanism for the implementation of the various code models. */
9391 static void
9392 initialize_aarch64_code_model (struct gcc_options *opts)
9394 if (opts->x_flag_pic)
9396 switch (opts->x_aarch64_cmodel_var)
9398 case AARCH64_CMODEL_TINY:
9399 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9400 break;
9401 case AARCH64_CMODEL_SMALL:
9402 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9403 aarch64_cmodel = (flag_pic == 2
9404 ? AARCH64_CMODEL_SMALL_PIC
9405 : AARCH64_CMODEL_SMALL_SPIC);
9406 #else
9407 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9408 #endif
9409 break;
9410 case AARCH64_CMODEL_LARGE:
9411 sorry ("code model %qs with -f%s", "large",
9412 opts->x_flag_pic > 1 ? "PIC" : "pic");
9413 break;
9414 default:
9415 gcc_unreachable ();
9418 else
9419 aarch64_cmodel = opts->x_aarch64_cmodel_var;
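/* Summary of the mapping above, assuming an assembler with small PIC
   relocation support (HAVE_AS_SMALL_PIC_RELOCS):

     -mcmodel=tiny  with -fpic or -fPIC  ->  AARCH64_CMODEL_TINY_PIC
     -mcmodel=small with -fpic           ->  AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small with -fPIC           ->  AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large with any PIC flag    ->  rejected with sorry ()

   without that assembler support, both PIC levels of the small model
   fall back to AARCH64_CMODEL_SMALL_PIC.  */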
9422 /* Implement TARGET_OPTION_SAVE. */
9424 static void
9425 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9427 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9430 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9431 using the information saved in PTR. */
9433 static void
9434 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9436 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9437 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9438 opts->x_explicit_arch = ptr->x_explicit_arch;
9439 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9440 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9442 aarch64_override_options_internal (opts);
9445 /* Implement TARGET_OPTION_PRINT. */
9447 static void
9448 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9450 const struct processor *cpu
9451 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9452 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9453 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9454 std::string extension
9455 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9457 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9458 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9459 arch->name, extension.c_str ());
9462 static GTY(()) tree aarch64_previous_fndecl;
9464 void
9465 aarch64_reset_previous_fndecl (void)
9467 aarch64_previous_fndecl = NULL;
9470 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9471 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9472 make sure optab availability predicates are recomputed when necessary. */
9474 void
9475 aarch64_save_restore_target_globals (tree new_tree)
9477 if (TREE_TARGET_GLOBALS (new_tree))
9478 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9479 else if (new_tree == target_option_default_node)
9480 restore_target_globals (&default_target_globals);
9481 else
9482 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9485 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9486 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9487 of the function, if such exists. This function may be called multiple
9488 times on a single function so use aarch64_previous_fndecl to avoid
9489 setting up identical state. */
9491 static void
9492 aarch64_set_current_function (tree fndecl)
9494 if (!fndecl || fndecl == aarch64_previous_fndecl)
9495 return;
9497 tree old_tree = (aarch64_previous_fndecl
9498 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9499 : NULL_TREE);
9501 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9503 /* If current function has no attributes but the previous one did,
9504 use the default node. */
9505 if (!new_tree && old_tree)
9506 new_tree = target_option_default_node;
9508 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9509 the default have been handled by aarch64_save_restore_target_globals from
9510 aarch64_pragma_target_parse. */
9511 if (old_tree == new_tree)
9512 return;
9514 aarch64_previous_fndecl = fndecl;
9516 /* First set the target options. */
9517 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9519 aarch64_save_restore_target_globals (new_tree);
9522 /* Enum describing the various ways we can handle attributes.
9523 In many cases we can reuse the generic option handling machinery. */
9525 enum aarch64_attr_opt_type
9527 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9528 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9529 aarch64_attr_enum, /* Attribute sets an enum variable. */
9530 aarch64_attr_custom /* Attribute requires a custom handling function. */
9533 /* All the information needed to handle a target attribute.
9534 NAME is the name of the attribute.
9535 ATTR_TYPE specifies the type of behavior of the attribute as described
9536 in the definition of enum aarch64_attr_opt_type.
9537 ALLOW_NEG is true if the attribute supports a "no-" form.
9538 HANDLER is the function that takes the attribute string and whether
9539 it is a pragma or attribute and handles the option. It is needed only
9540 when the ATTR_TYPE is aarch64_attr_custom.
9541 OPT_NUM is the enum specifying the option that the attribute modifies.
9542 This is needed for attributes that mirror the behavior of a command-line
9543 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9544 aarch64_attr_enum. */
9546 struct aarch64_attribute_info
9548 const char *name;
9549 enum aarch64_attr_opt_type attr_type;
9550 bool allow_neg;
9551 bool (*handler) (const char *, const char *);
9552 enum opt_code opt_num;
9555 /* Handle the ARCH_STR argument to the arch= target attribute.
9556 PRAGMA_OR_ATTR is used in potential error messages. */
9558 static bool
9559 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9561 const struct processor *tmp_arch = NULL;
9562 enum aarch64_parse_opt_result parse_res
9563 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9565 if (parse_res == AARCH64_PARSE_OK)
9567 gcc_assert (tmp_arch);
9568 selected_arch = tmp_arch;
9569 explicit_arch = selected_arch->arch;
9570 return true;
9573 switch (parse_res)
9575 case AARCH64_PARSE_MISSING_ARG:
9576 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9577 break;
9578 case AARCH64_PARSE_INVALID_ARG:
9579 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9580 aarch64_print_hint_for_arch (str);
9581 break;
9582 case AARCH64_PARSE_INVALID_FEATURE:
9583 error ("invalid feature modifier %qs for 'arch' target %s",
9584 str, pragma_or_attr);
9585 break;
9586 default:
9587 gcc_unreachable ();
9590 return false;
9593 /* Handle the argument CPU_STR to the cpu= target attribute.
9594 PRAGMA_OR_ATTR is used in potential error messages. */
9596 static bool
9597 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9599 const struct processor *tmp_cpu = NULL;
9600 enum aarch64_parse_opt_result parse_res
9601 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9603 if (parse_res == AARCH64_PARSE_OK)
9605 gcc_assert (tmp_cpu);
9606 selected_tune = tmp_cpu;
9607 explicit_tune_core = selected_tune->ident;
9609 selected_arch = &all_architectures[tmp_cpu->arch];
9610 explicit_arch = selected_arch->arch;
9611 return true;
9614 switch (parse_res)
9616 case AARCH64_PARSE_MISSING_ARG:
9617 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9618 break;
9619 case AARCH64_PARSE_INVALID_ARG:
9620 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9621 aarch64_print_hint_for_core (str);
9622 break;
9623 case AARCH64_PARSE_INVALID_FEATURE:
9624 error ("invalid feature modifier %qs for 'cpu' target %s",
9625 str, pragma_or_attr);
9626 break;
9627 default:
9628 gcc_unreachable ();
9631 return false;
9634 /* Handle the argument STR to the tune= target attribute.
9635 PRAGMA_OR_ATTR is used in potential error messages. */
9637 static bool
9638 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9640 const struct processor *tmp_tune = NULL;
9641 enum aarch64_parse_opt_result parse_res
9642 = aarch64_parse_tune (str, &tmp_tune);
9644 if (parse_res == AARCH64_PARSE_OK)
9646 gcc_assert (tmp_tune);
9647 selected_tune = tmp_tune;
9648 explicit_tune_core = selected_tune->ident;
9649 return true;
9652 switch (parse_res)
9654 case AARCH64_PARSE_INVALID_ARG:
9655 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9656 aarch64_print_hint_for_core (str);
9657 break;
9658 default:
9659 gcc_unreachable ();
9662 return false;
9665 /* Parse an architecture extensions target attribute string specified in STR.
9666 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9667 if successful. Update aarch64_isa_flags to reflect the ISA features
9668 modified.
9669 PRAGMA_OR_ATTR is used in potential error messages. */
9671 static bool
9672 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9674 enum aarch64_parse_opt_result parse_res;
9675 unsigned long isa_flags = aarch64_isa_flags;
9677 /* We allow "+nothing" in the beginning to clear out all architectural
9678 features if the user wants to handpick specific features. */
9679 if (strncmp ("+nothing", str, 8) == 0)
9681 isa_flags = 0;
9682 str += 8;
9685 parse_res = aarch64_parse_extension (str, &isa_flags);
9687 if (parse_res == AARCH64_PARSE_OK)
9689 aarch64_isa_flags = isa_flags;
9690 return true;
9693 switch (parse_res)
9695 case AARCH64_PARSE_MISSING_ARG:
9696 error ("missing feature modifier in target %s %qs",
9697 pragma_or_attr, str);
9698 break;
9700 case AARCH64_PARSE_INVALID_FEATURE:
9701 error ("invalid feature modifier in target %s %qs",
9702 pragma_or_attr, str);
9703 break;
9705 default:
9706 gcc_unreachable ();
9709 return false;
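/* Example of the handling above: a declaration such as

     __attribute__ ((target ("+nothing+simd")))

   first clears the ISA flags because of the leading "+nothing" and then
   re-enables only the modifiers that follow it ("+simd" here is just an
   illustrative choice), via aarch64_parse_extension.  */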
9712 /* The target attributes that we support. On top of these we also support just
9713 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9714 handled explicitly in aarch64_process_one_target_attr. */
9716 static const struct aarch64_attribute_info aarch64_attributes[] =
9718 { "general-regs-only", aarch64_attr_mask, false, NULL,
9719 OPT_mgeneral_regs_only },
9720 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9721 OPT_mfix_cortex_a53_835769 },
9722 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9723 OPT_mfix_cortex_a53_843419 },
9724 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9725 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9726 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9727 OPT_momit_leaf_frame_pointer },
9728 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9729 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9730 OPT_march_ },
9731 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9732 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9733 OPT_mtune_ },
9734 { "sign-return-address", aarch64_attr_enum, false, NULL,
9735 OPT_msign_return_address_ },
9736 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9739 /* Parse ARG_STR which contains the definition of one target attribute.
9740 Show appropriate errors if any or return true if the attribute is valid.
9741 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9742 we're processing a target attribute or pragma. */
9744 static bool
9745 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9747 bool invert = false;
9749 size_t len = strlen (arg_str);
9751 if (len == 0)
9753 error ("malformed target %s", pragma_or_attr);
9754 return false;
9757 char *str_to_check = (char *) alloca (len + 1);
9758 strcpy (str_to_check, arg_str);
9760 /* Skip leading whitespace. */
9761 while (*str_to_check == ' ' || *str_to_check == '\t')
9762 str_to_check++;
9764 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9765 It is easier to detect and handle it explicitly here rather than going
9766 through the machinery for the rest of the target attributes in this
9767 function. */
9768 if (*str_to_check == '+')
9769 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9771 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9773 invert = true;
9774 str_to_check += 3;
9776 char *arg = strchr (str_to_check, '=');
9778 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9779 and point ARG to "foo". */
9780 if (arg)
9782 *arg = '\0';
9783 arg++;
9785 const struct aarch64_attribute_info *p_attr;
9786 bool found = false;
9787 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9789 /* If the names don't match up, or the user has given an argument
9790 to an attribute that doesn't accept one, or didn't give an argument
9791 to an attribute that expects one, fail to match. */
9792 if (strcmp (str_to_check, p_attr->name) != 0)
9793 continue;
9795 found = true;
9796 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9797 || p_attr->attr_type == aarch64_attr_enum;
9799 if (attr_need_arg_p ^ (arg != NULL))
9801 error ("target %s %qs does not accept an argument",
9802 pragma_or_attr, str_to_check);
9803 return false;
9806 /* If the name matches but the attribute does not allow "no-" versions
9807 then we can't match. */
9808 if (invert && !p_attr->allow_neg)
9810 error ("target %s %qs does not allow a negated form",
9811 pragma_or_attr, str_to_check);
9812 return false;
9815 switch (p_attr->attr_type)
9817 /* Has a custom handler registered.
9818 For example, cpu=, arch=, tune=. */
9819 case aarch64_attr_custom:
9820 gcc_assert (p_attr->handler);
9821 if (!p_attr->handler (arg, pragma_or_attr))
9822 return false;
9823 break;
9825 /* Either set or unset a boolean option. */
9826 case aarch64_attr_bool:
9828 struct cl_decoded_option decoded;
9830 generate_option (p_attr->opt_num, NULL, !invert,
9831 CL_TARGET, &decoded);
9832 aarch64_handle_option (&global_options, &global_options_set,
9833 &decoded, input_location);
9834 break;
9836 /* Set or unset a bit in the target_flags. aarch64_handle_option
9837 should know what mask to apply given the option number. */
9838 case aarch64_attr_mask:
9840 struct cl_decoded_option decoded;
9841 /* We only need to specify the option number.
9842 aarch64_handle_option will know which mask to apply. */
9843 decoded.opt_index = p_attr->opt_num;
9844 decoded.value = !invert;
9845 aarch64_handle_option (&global_options, &global_options_set,
9846 &decoded, input_location);
9847 break;
9849 /* Use the option setting machinery to set an option to an enum. */
9850 case aarch64_attr_enum:
9852 gcc_assert (arg);
9853 bool valid;
9854 int value;
9855 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9856 &value, CL_TARGET);
9857 if (valid)
9859 set_option (&global_options, NULL, p_attr->opt_num, value,
9860 NULL, DK_UNSPECIFIED, input_location,
9861 global_dc);
9863 else
9865 error ("target %s %s=%s is not valid",
9866 pragma_or_attr, str_to_check, arg);
9868 break;
9870 default:
9871 gcc_unreachable ();
9875 /* If we reached here we either have found an attribute and validated
9876 it or didn't match any. If we matched an attribute but its arguments
9877 were malformed we will have returned false already. */
9878 return found;
9881 /* Count how many times the character C appears in
9882 the NUL-terminated string STR. */
9884 static unsigned int
9885 num_occurences_in_str (char c, char *str)
9887 unsigned int res = 0;
9888 while (*str != '\0')
9890 if (*str == c)
9891 res++;
9893 str++;
9896 return res;
9899 /* Parse the tree in ARGS that contains the target attribute information
9900 and update the global target options space. PRAGMA_OR_ATTR is a string
9901 to be used in error messages, specifying whether this is processing
9902 a target attribute or a target pragma. */
9904 bool
9905 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9907 if (TREE_CODE (args) == TREE_LIST)
9911 tree head = TREE_VALUE (args);
9912 if (head)
9914 if (!aarch64_process_target_attr (head, pragma_or_attr))
9915 return false;
9917 args = TREE_CHAIN (args);
9918 } while (args);
9920 return true;
9923 if (TREE_CODE (args) != STRING_CST)
9925 error ("attribute %<target%> argument not a string");
9926 return false;
9929 size_t len = strlen (TREE_STRING_POINTER (args));
9930 char *str_to_check = (char *) alloca (len + 1);
9931 strcpy (str_to_check, TREE_STRING_POINTER (args));
9933 if (len == 0)
9935 error ("malformed target %s value", pragma_or_attr);
9936 return false;
9939 /* Used to catch empty spaces between commas i.e.
9940 attribute ((target ("attr1,,attr2"))). */
9941 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9943 /* Handle multiple target attributes separated by ','. */
9944 char *token = strtok (str_to_check, ",");
9946 unsigned int num_attrs = 0;
9947 while (token)
9949 num_attrs++;
9950 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9952 error ("target %s %qs is invalid", pragma_or_attr, token);
9953 return false;
9956 token = strtok (NULL, ",");
9959 if (num_attrs != num_commas + 1)
9961 error ("malformed target %s list %qs",
9962 pragma_or_attr, TREE_STRING_POINTER (args));
9963 return false;
9966 return true;
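/* Illustrative note (added commentary, not part of the original source):
   the strings handled above are the ones users write directly, e.g.

     __attribute__ ((target ("arch=armv8-a+crc,tune=cortex-a57")))
     int f (int x) { return x + 1; }

     #pragma GCC target ("cpu=cortex-a57")

   Each comma-separated token goes through aarch64_process_one_target_attr,
   and the num_commas check rejects empty tokens such as the
   "attr1,,attr2" case mentioned above.  The option names shown here are
   only examples.  */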
9969 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9970 process attribute ((target ("..."))). */
9972 static bool
9973 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9975 struct cl_target_option cur_target;
9976 bool ret;
9977 tree old_optimize;
9978 tree new_target, new_optimize;
9979 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9981 /* If what we're processing is the current pragma string then the
9982 target option node is already stored in target_option_current_node
9983 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9984 having to re-parse the string. This is especially useful to keep
9985 arm_neon.h compile times down since that header contains a lot
9986 of intrinsics enclosed in pragmas. */
9987 if (!existing_target && args == current_target_pragma)
9989 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9990 return true;
9992 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9994 old_optimize = build_optimization_node (&global_options);
9995 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9997 /* If the function changed the optimization levels as well as setting
9998 target options, start with the optimizations specified. */
9999 if (func_optimize && func_optimize != old_optimize)
10000 cl_optimization_restore (&global_options,
10001 TREE_OPTIMIZATION (func_optimize));
10003 /* Save the current target options to restore at the end. */
10004 cl_target_option_save (&cur_target, &global_options);
10006 /* If fndecl already has some target attributes applied to it, unpack
10007 them so that we add this attribute on top of them, rather than
10008 overwriting them. */
10009 if (existing_target)
10011 struct cl_target_option *existing_options
10012 = TREE_TARGET_OPTION (existing_target);
10014 if (existing_options)
10015 cl_target_option_restore (&global_options, existing_options);
10017 else
10018 cl_target_option_restore (&global_options,
10019 TREE_TARGET_OPTION (target_option_current_node));
10022 ret = aarch64_process_target_attr (args, "attribute");
10024 /* Set up any additional state. */
10025 if (ret)
10027 aarch64_override_options_internal (&global_options);
10028 /* Initialize SIMD builtins if we haven't already.
10029 Set current_target_pragma to NULL for the duration so that
10030 the builtin initialization code doesn't try to tag the functions
10031 being built with the attributes specified by any current pragma, thus
10032 going into an infinite recursion. */
10033 if (TARGET_SIMD)
10035 tree saved_current_target_pragma = current_target_pragma;
10036 current_target_pragma = NULL;
10037 aarch64_init_simd_builtins ();
10038 current_target_pragma = saved_current_target_pragma;
10040 new_target = build_target_option_node (&global_options);
10042 else
10043 new_target = NULL;
10045 new_optimize = build_optimization_node (&global_options);
10047 if (fndecl && ret)
10049 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10051 if (old_optimize != new_optimize)
10052 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10055 cl_target_option_restore (&global_options, &cur_target);
10057 if (old_optimize != new_optimize)
10058 cl_optimization_restore (&global_options,
10059 TREE_OPTIMIZATION (old_optimize));
10060 return ret;
10063 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
10064 tri-bool options (yes, no, don't care) and the default value is
10065 DEF, determine whether to reject inlining. */
10067 static bool
10068 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10069 int dont_care, int def)
10071 /* If the callee doesn't care, always allow inlining. */
10072 if (callee == dont_care)
10073 return true;
10075 /* If the caller doesn't care, always allow inlining. */
10076 if (caller == dont_care)
10077 return true;
10079 /* Otherwise, allow inlining if either the callee and caller values
10080 agree, or if the callee is using the default value. */
10081 return (callee == caller || callee == def);
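/* Illustrative summary (added commentary, not part of the original
   source): with DONT_CARE == 2 the helper above accepts inlining when
   either side does not care, when both sides agree, or when the callee
   merely uses the default:

       caller  callee   inlining allowed?
         2       *      yes
         *       2      yes
         x       x      yes
         x       y      only if y == DEF

   so an explicit, mismatching request in the callee is the only thing
   that blocks inlining here.  */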
10084 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10085 to inline CALLEE into CALLER based on target-specific info.
10086 Make sure that the caller and callee have compatible architectural
10087 features. Then go through the other possible target attributes
10088 and see if they can block inlining. Try not to reject always_inline
10089 callees unless they are incompatible architecturally. */
10091 static bool
10092 aarch64_can_inline_p (tree caller, tree callee)
10094 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10095 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10097 /* If callee has no option attributes, then it is ok to inline. */
10098 if (!callee_tree)
10099 return true;
10101 struct cl_target_option *caller_opts
10102 = TREE_TARGET_OPTION (caller_tree ? caller_tree
10103 : target_option_default_node);
10105 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10108 /* Callee's ISA flags should be a subset of the caller's. */
10109 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10110 != callee_opts->x_aarch64_isa_flags)
10111 return false;
10113 /* Allow a callee compiled without strict alignment to be inlined into
10114 a strict-align caller, but not the reverse. */
10115 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10116 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10117 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10118 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10119 return false;
10121 bool always_inline = lookup_attribute ("always_inline",
10122 DECL_ATTRIBUTES (callee));
10124 /* If the architectural features match up and the callee is always_inline
10125 then the other attributes don't matter. */
10126 if (always_inline)
10127 return true;
10129 if (caller_opts->x_aarch64_cmodel_var
10130 != callee_opts->x_aarch64_cmodel_var)
10131 return false;
10133 if (caller_opts->x_aarch64_tls_dialect
10134 != callee_opts->x_aarch64_tls_dialect)
10135 return false;
10137 /* Honour explicit requests to workaround errata. */
10138 if (!aarch64_tribools_ok_for_inlining_p (
10139 caller_opts->x_aarch64_fix_a53_err835769,
10140 callee_opts->x_aarch64_fix_a53_err835769,
10141 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10142 return false;
10144 if (!aarch64_tribools_ok_for_inlining_p (
10145 caller_opts->x_aarch64_fix_a53_err843419,
10146 callee_opts->x_aarch64_fix_a53_err843419,
10147 2, TARGET_FIX_ERR_A53_843419))
10148 return false;
10150 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10151 caller and callee and they don't match up, reject inlining. */
10152 if (!aarch64_tribools_ok_for_inlining_p (
10153 caller_opts->x_flag_omit_leaf_frame_pointer,
10154 callee_opts->x_flag_omit_leaf_frame_pointer,
10155 2, 1))
10156 return false;
10158 /* If the callee has specific tuning overrides, respect them. */
10159 if (callee_opts->x_aarch64_override_tune_string != NULL
10160 && caller_opts->x_aarch64_override_tune_string == NULL)
10161 return false;
10163 /* If the user specified tuning override strings for the
10164 caller and callee and they don't match up, reject inlining.
10165 We just do a string compare here, we don't analyze the meaning
10166 of the string, as it would be too costly for little gain. */
10167 if (callee_opts->x_aarch64_override_tune_string
10168 && caller_opts->x_aarch64_override_tune_string
10169 && (strcmp (callee_opts->x_aarch64_override_tune_string,
10170 caller_opts->x_aarch64_override_tune_string) != 0))
10171 return false;
10173 return true;
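/* Hedged example (added commentary, not part of the original source):
   the ISA-flag subset check above means that a callee sketched as

     __attribute__ ((target ("+crc")))
     static inline uint32_t step (uint32_t acc, uint32_t data);

   can be inlined only into callers whose target options also include the
   CRC extension (e.g. -march=armv8-a+crc or a matching target attribute);
   with a plain -march=armv8-a caller the callee's ISA flags are not a
   subset of the caller's, so inlining is refused even for always_inline
   callees.  */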
10176 /* Return true if SYMBOL_REF X binds locally. */
10178 static bool
10179 aarch64_symbol_binds_local_p (const_rtx x)
10181 return (SYMBOL_REF_DECL (x)
10182 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10183 : SYMBOL_REF_LOCAL_P (x));
10186 /* Return true if SYMBOL_REF X is thread local */
10187 static bool
10188 aarch64_tls_symbol_p (rtx x)
10190 if (! TARGET_HAVE_TLS)
10191 return false;
10193 if (GET_CODE (x) != SYMBOL_REF)
10194 return false;
10196 return SYMBOL_REF_TLS_MODEL (x) != 0;
10199 /* Classify a TLS symbol into one of the TLS kinds. */
10200 enum aarch64_symbol_type
10201 aarch64_classify_tls_symbol (rtx x)
10203 enum tls_model tls_kind = tls_symbolic_operand_type (x);
10205 switch (tls_kind)
10207 case TLS_MODEL_GLOBAL_DYNAMIC:
10208 case TLS_MODEL_LOCAL_DYNAMIC:
10209 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10211 case TLS_MODEL_INITIAL_EXEC:
10212 switch (aarch64_cmodel)
10214 case AARCH64_CMODEL_TINY:
10215 case AARCH64_CMODEL_TINY_PIC:
10216 return SYMBOL_TINY_TLSIE;
10217 default:
10218 return SYMBOL_SMALL_TLSIE;
10221 case TLS_MODEL_LOCAL_EXEC:
10222 if (aarch64_tls_size == 12)
10223 return SYMBOL_TLSLE12;
10224 else if (aarch64_tls_size == 24)
10225 return SYMBOL_TLSLE24;
10226 else if (aarch64_tls_size == 32)
10227 return SYMBOL_TLSLE32;
10228 else if (aarch64_tls_size == 48)
10229 return SYMBOL_TLSLE48;
10230 else
10231 gcc_unreachable ();
10233 case TLS_MODEL_EMULATED:
10234 case TLS_MODEL_NONE:
10235 return SYMBOL_FORCE_TO_MEM;
10237 default:
10238 gcc_unreachable ();
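/* Illustrative mapping (added commentary, not part of the original
   source): for the default small code model and the default
   -mtls-size=24 the switch above yields

     global/local dynamic  ->  SYMBOL_SMALL_TLSDESC  (-mtls-dialect=desc)
                               or SYMBOL_SMALL_TLSGD (-mtls-dialect=trad)
     initial exec          ->  SYMBOL_SMALL_TLSIE
     local exec            ->  SYMBOL_TLSLE24
     emulated / no TLS     ->  SYMBOL_FORCE_TO_MEM

   The tiny code model swaps in SYMBOL_TINY_TLSIE for initial exec, and
   other -mtls-size values pick the corresponding SYMBOL_TLSLE variant.  */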
10242 /* Return the method that should be used to access SYMBOL_REF or
10243 LABEL_REF X. */
10245 enum aarch64_symbol_type
10246 aarch64_classify_symbol (rtx x, rtx offset)
10248 if (GET_CODE (x) == LABEL_REF)
10250 switch (aarch64_cmodel)
10252 case AARCH64_CMODEL_LARGE:
10253 return SYMBOL_FORCE_TO_MEM;
10255 case AARCH64_CMODEL_TINY_PIC:
10256 case AARCH64_CMODEL_TINY:
10257 return SYMBOL_TINY_ABSOLUTE;
10259 case AARCH64_CMODEL_SMALL_SPIC:
10260 case AARCH64_CMODEL_SMALL_PIC:
10261 case AARCH64_CMODEL_SMALL:
10262 return SYMBOL_SMALL_ABSOLUTE;
10264 default:
10265 gcc_unreachable ();
10269 if (GET_CODE (x) == SYMBOL_REF)
10271 if (aarch64_tls_symbol_p (x))
10272 return aarch64_classify_tls_symbol (x);
10274 switch (aarch64_cmodel)
10276 case AARCH64_CMODEL_TINY:
10277 /* When we retrieve symbol + offset address, we have to make sure
10278 the offset does not cause overflow of the final address. But
10279 we have no way of knowing the address of symbol at compile time
10280 so we can't accurately say if the distance between the PC and
10281 symbol + offset is outside the addressable range of +/-1M in the
10282 TINY code model. So we rely on images not being greater than
10283 1M, cap the offset at 1M, and anything beyond that will have to
10284 be loaded using an alternative mechanism. Furthermore, if the
10285 symbol is a weak reference to something that isn't known to
10286 resolve to a symbol in this module, then force to memory. */
10287 if ((SYMBOL_REF_WEAK (x)
10288 && !aarch64_symbol_binds_local_p (x))
10289 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10290 return SYMBOL_FORCE_TO_MEM;
10291 return SYMBOL_TINY_ABSOLUTE;
10293 case AARCH64_CMODEL_SMALL:
10294 /* Same reasoning as the tiny code model, but the offset cap here is
10295 4G. */
10296 if ((SYMBOL_REF_WEAK (x)
10297 && !aarch64_symbol_binds_local_p (x))
10298 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10299 HOST_WIDE_INT_C (4294967264)))
10300 return SYMBOL_FORCE_TO_MEM;
10301 return SYMBOL_SMALL_ABSOLUTE;
10303 case AARCH64_CMODEL_TINY_PIC:
10304 if (!aarch64_symbol_binds_local_p (x))
10305 return SYMBOL_TINY_GOT;
10306 return SYMBOL_TINY_ABSOLUTE;
10308 case AARCH64_CMODEL_SMALL_SPIC:
10309 case AARCH64_CMODEL_SMALL_PIC:
10310 if (!aarch64_symbol_binds_local_p (x))
10311 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10312 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10313 return SYMBOL_SMALL_ABSOLUTE;
10315 case AARCH64_CMODEL_LARGE:
10316 /* This is alright even in PIC code as the constant
10317 pool reference is always PC relative and within
10318 the same translation unit. */
10319 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10320 return SYMBOL_SMALL_ABSOLUTE;
10321 else
10322 return SYMBOL_FORCE_TO_MEM;
10324 default:
10325 gcc_unreachable ();
10329 /* By default push everything into the constant pool. */
10330 return SYMBOL_FORCE_TO_MEM;
10333 bool
10334 aarch64_constant_address_p (rtx x)
10336 return (CONSTANT_P (x) && memory_address_p (DImode, x));
10339 bool
10340 aarch64_legitimate_pic_operand_p (rtx x)
10342 if (GET_CODE (x) == SYMBOL_REF
10343 || (GET_CODE (x) == CONST
10344 && GET_CODE (XEXP (x, 0)) == PLUS
10345 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10346 return false;
10348 return true;
10351 /* Return true if X holds either a quarter-precision floating-point
10352 constant or floating-point +0.0. */
10353 static bool
10354 aarch64_valid_floating_const (rtx x)
10356 if (!CONST_DOUBLE_P (x))
10357 return false;
10359 /* This call determines which constants can be used in mov<mode>
10360 as integer moves instead of constant loads. */
10361 if (aarch64_float_const_rtx_p (x))
10362 return true;
10364 return aarch64_float_const_representable_p (x);
10367 static bool
10368 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10370 /* Do not allow vector struct mode constants. We could support
10371 0 and -1 easily, but they need support in aarch64-simd.md. */
10372 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10373 return false;
10375 /* For these cases we never want to use a literal load.
10376 As such we have to prevent the compiler from forcing these
10377 to memory. */
10378 if ((GET_CODE (x) == CONST_VECTOR
10379 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10380 || CONST_INT_P (x)
10381 || aarch64_valid_floating_const (x)
10382 || aarch64_can_const_movi_rtx_p (x, mode)
10383 || aarch64_float_const_rtx_p (x))
10384 return !targetm.cannot_force_const_mem (mode, x);
10386 if (GET_CODE (x) == HIGH
10387 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10388 return true;
10390 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10391 so spilling them is better than rematerialization. */
10392 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10393 return true;
10395 return aarch64_constant_address_p (x);
10398 static rtx
10399 aarch64_load_tp (rtx target)
10401 if (!target
10402 || GET_MODE (target) != Pmode
10403 || !register_operand (target, Pmode))
10404 target = gen_reg_rtx (Pmode);
10406 /* Can return in any reg. */
10407 emit_insn (gen_aarch64_load_tp_hard (target));
10408 return target;
10411 /* On AAPCS systems, this is the "struct __va_list". */
10412 static GTY(()) tree va_list_type;
10414 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10415 Return the type to use as __builtin_va_list.
10417 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10419 struct __va_list
10421 void *__stack;
10422 void *__gr_top;
10423 void *__vr_top;
10424 int __gr_offs;
10425 int __vr_offs;
10426 }; */
10428 static tree
10429 aarch64_build_builtin_va_list (void)
10431 tree va_list_name;
10432 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10434 /* Create the type. */
10435 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10436 /* Give it the required name. */
10437 va_list_name = build_decl (BUILTINS_LOCATION,
10438 TYPE_DECL,
10439 get_identifier ("__va_list"),
10440 va_list_type);
10441 DECL_ARTIFICIAL (va_list_name) = 1;
10442 TYPE_NAME (va_list_type) = va_list_name;
10443 TYPE_STUB_DECL (va_list_type) = va_list_name;
10445 /* Create the fields. */
10446 f_stack = build_decl (BUILTINS_LOCATION,
10447 FIELD_DECL, get_identifier ("__stack"),
10448 ptr_type_node);
10449 f_grtop = build_decl (BUILTINS_LOCATION,
10450 FIELD_DECL, get_identifier ("__gr_top"),
10451 ptr_type_node);
10452 f_vrtop = build_decl (BUILTINS_LOCATION,
10453 FIELD_DECL, get_identifier ("__vr_top"),
10454 ptr_type_node);
10455 f_groff = build_decl (BUILTINS_LOCATION,
10456 FIELD_DECL, get_identifier ("__gr_offs"),
10457 integer_type_node);
10458 f_vroff = build_decl (BUILTINS_LOCATION,
10459 FIELD_DECL, get_identifier ("__vr_offs"),
10460 integer_type_node);
10462 /* Tell tree-stdarg pass about our internal offset fields.
10463 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10464 purposes, to identify whether the code is updating the va_list internal
10465 offset fields in an irregular way. */
10466 va_list_gpr_counter_field = f_groff;
10467 va_list_fpr_counter_field = f_vroff;
10469 DECL_ARTIFICIAL (f_stack) = 1;
10470 DECL_ARTIFICIAL (f_grtop) = 1;
10471 DECL_ARTIFICIAL (f_vrtop) = 1;
10472 DECL_ARTIFICIAL (f_groff) = 1;
10473 DECL_ARTIFICIAL (f_vroff) = 1;
10475 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10476 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10477 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10478 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10479 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10481 TYPE_FIELDS (va_list_type) = f_stack;
10482 DECL_CHAIN (f_stack) = f_grtop;
10483 DECL_CHAIN (f_grtop) = f_vrtop;
10484 DECL_CHAIN (f_vrtop) = f_groff;
10485 DECL_CHAIN (f_groff) = f_vroff;
10487 /* Compute its layout. */
10488 layout_type (va_list_type);
10490 return va_list_type;
10493 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10494 static void
10495 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10497 const CUMULATIVE_ARGS *cum;
10498 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10499 tree stack, grtop, vrtop, groff, vroff;
10500 tree t;
10501 int gr_save_area_size = cfun->va_list_gpr_size;
10502 int vr_save_area_size = cfun->va_list_fpr_size;
10503 int vr_offset;
10505 cum = &crtl->args.info;
10506 if (cfun->va_list_gpr_size)
10507 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10508 cfun->va_list_gpr_size);
10509 if (cfun->va_list_fpr_size)
10510 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10511 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10513 if (!TARGET_FLOAT)
10515 gcc_assert (cum->aapcs_nvrn == 0);
10516 vr_save_area_size = 0;
10519 f_stack = TYPE_FIELDS (va_list_type_node);
10520 f_grtop = DECL_CHAIN (f_stack);
10521 f_vrtop = DECL_CHAIN (f_grtop);
10522 f_groff = DECL_CHAIN (f_vrtop);
10523 f_vroff = DECL_CHAIN (f_groff);
10525 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10526 NULL_TREE);
10527 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10528 NULL_TREE);
10529 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10530 NULL_TREE);
10531 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10532 NULL_TREE);
10533 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10534 NULL_TREE);
10536 /* Emit code to initialize STACK, which points to the next varargs stack
10537 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10538 by named arguments. STACK is 8-byte aligned. */
10539 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10540 if (cum->aapcs_stack_size > 0)
10541 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10542 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10543 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10545 /* Emit code to initialize GRTOP, the top of the GR save area.
10546 virtual_incoming_args_rtx should have been 16 byte aligned. */
10547 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10548 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10549 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10551 /* Emit code to initialize VRTOP, the top of the VR save area.
10552 This address is gr_save_area_bytes below GRTOP, rounded
10553 down to the next 16-byte boundary. */
10554 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10555 vr_offset = ROUND_UP (gr_save_area_size,
10556 STACK_BOUNDARY / BITS_PER_UNIT);
10558 if (vr_offset)
10559 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10560 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10561 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10563 /* Emit code to initialize GROFF, the offset from GRTOP of the
10564 next GPR argument. */
10565 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10566 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10567 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10569 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10570 of the next VR argument. */
10571 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10572 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10573 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
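/* Illustrative sketch (added commentary, not part of the original
   source): for a variadic callee such as

     int sum (int n, ...);

   the code above leaves the va_list roughly in this state, with the
   offsets negative and counting up towards zero as argument registers
   are consumed:

     ap.__stack   = <address of the first stack-passed vararg>;
     ap.__gr_top  = <end (highest address) of the saved X-register area>;
     ap.__vr_top  = <end (highest address) of the saved V-register area>;
     ap.__gr_offs = -(number of unused GP argument registers) * 8;
     ap.__vr_offs = -(number of unused FP/SIMD argument registers) * 16;

   This mirrors the AAPCS64 va_list description; the concrete values
   depend on how many named arguments were passed in registers.  */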
10576 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10578 static tree
10579 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10580 gimple_seq *post_p ATTRIBUTE_UNUSED)
10582 tree addr;
10583 bool indirect_p;
10584 bool is_ha; /* is HFA or HVA. */
10585 bool dw_align; /* double-word align. */
10586 machine_mode ag_mode = VOIDmode;
10587 int nregs;
10588 machine_mode mode;
10590 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10591 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10592 HOST_WIDE_INT size, rsize, adjust, align;
10593 tree t, u, cond1, cond2;
10595 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10596 if (indirect_p)
10597 type = build_pointer_type (type);
10599 mode = TYPE_MODE (type);
10601 f_stack = TYPE_FIELDS (va_list_type_node);
10602 f_grtop = DECL_CHAIN (f_stack);
10603 f_vrtop = DECL_CHAIN (f_grtop);
10604 f_groff = DECL_CHAIN (f_vrtop);
10605 f_vroff = DECL_CHAIN (f_groff);
10607 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10608 f_stack, NULL_TREE);
10609 size = int_size_in_bytes (type);
10610 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10612 dw_align = false;
10613 adjust = 0;
10614 if (aarch64_vfp_is_call_or_return_candidate (mode,
10615 type,
10616 &ag_mode,
10617 &nregs,
10618 &is_ha))
10620 /* TYPE passed in fp/simd registers. */
10621 if (!TARGET_FLOAT)
10622 aarch64_err_no_fpadvsimd (mode, "varargs");
10624 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10625 unshare_expr (valist), f_vrtop, NULL_TREE);
10626 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10627 unshare_expr (valist), f_vroff, NULL_TREE);
10629 rsize = nregs * UNITS_PER_VREG;
10631 if (is_ha)
10633 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10634 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10636 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10637 && size < UNITS_PER_VREG)
10639 adjust = UNITS_PER_VREG - size;
10642 else
10644 /* TYPE passed in general registers. */
10645 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10646 unshare_expr (valist), f_grtop, NULL_TREE);
10647 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10648 unshare_expr (valist), f_groff, NULL_TREE);
10649 rsize = ROUND_UP (size, UNITS_PER_WORD);
10650 nregs = rsize / UNITS_PER_WORD;
10652 if (align > 8)
10653 dw_align = true;
10655 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10656 && size < UNITS_PER_WORD)
10658 adjust = UNITS_PER_WORD - size;
10662 /* Get a local temporary for the field value. */
10663 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10665 /* Emit code to branch if off >= 0. */
10666 t = build2 (GE_EXPR, boolean_type_node, off,
10667 build_int_cst (TREE_TYPE (off), 0));
10668 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10670 if (dw_align)
10672 /* Emit: offs = (offs + 15) & -16. */
10673 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10674 build_int_cst (TREE_TYPE (off), 15));
10675 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10676 build_int_cst (TREE_TYPE (off), -16));
10677 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10679 else
10680 roundup = NULL;
10682 /* Update ap.__[g|v]r_offs */
10683 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10684 build_int_cst (TREE_TYPE (off), rsize));
10685 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10687 /* String up. */
10688 if (roundup)
10689 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10691 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10692 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10693 build_int_cst (TREE_TYPE (f_off), 0));
10694 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10696 /* String up: make sure the assignment happens before the use. */
10697 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10698 COND_EXPR_ELSE (cond1) = t;
10700 /* Prepare the trees handling the argument that is passed on the stack;
10701 the top level node will store in ON_STACK. */
10702 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10703 if (align > 8)
10705 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10706 t = fold_convert (intDI_type_node, arg);
10707 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10708 build_int_cst (TREE_TYPE (t), 15));
10709 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10710 build_int_cst (TREE_TYPE (t), -16));
10711 t = fold_convert (TREE_TYPE (arg), t);
10712 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10714 else
10715 roundup = NULL;
10716 /* Advance ap.__stack */
10717 t = fold_convert (intDI_type_node, arg);
10718 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10719 build_int_cst (TREE_TYPE (t), size + 7));
10720 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10721 build_int_cst (TREE_TYPE (t), -8));
10722 t = fold_convert (TREE_TYPE (arg), t);
10723 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10724 /* String up roundup and advance. */
10725 if (roundup)
10726 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10727 /* String up with arg */
10728 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10729 /* Big-endianness related address adjustment. */
10730 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10731 && size < UNITS_PER_WORD)
10733 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10734 size_int (UNITS_PER_WORD - size));
10735 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10738 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10739 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10741 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10742 t = off;
10743 if (adjust)
10744 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10745 build_int_cst (TREE_TYPE (off), adjust));
10747 t = fold_convert (sizetype, t);
10748 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10750 if (is_ha)
10752 /* type ha; // treat as "struct {ftype field[n];}"
10753 ... [computing offs]
10754 for (i = 0; i <nregs; ++i, offs += 16)
10755 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10756 return ha; */
10757 int i;
10758 tree tmp_ha, field_t, field_ptr_t;
10760 /* Declare a local variable. */
10761 tmp_ha = create_tmp_var_raw (type, "ha");
10762 gimple_add_tmp_var (tmp_ha);
10764 /* Establish the base type. */
10765 switch (ag_mode)
10767 case E_SFmode:
10768 field_t = float_type_node;
10769 field_ptr_t = float_ptr_type_node;
10770 break;
10771 case E_DFmode:
10772 field_t = double_type_node;
10773 field_ptr_t = double_ptr_type_node;
10774 break;
10775 case E_TFmode:
10776 field_t = long_double_type_node;
10777 field_ptr_t = long_double_ptr_type_node;
10778 break;
10779 case E_HFmode:
10780 field_t = aarch64_fp16_type_node;
10781 field_ptr_t = aarch64_fp16_ptr_type_node;
10782 break;
10783 case E_V2SImode:
10784 case E_V4SImode:
10786 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10787 field_t = build_vector_type_for_mode (innertype, ag_mode);
10788 field_ptr_t = build_pointer_type (field_t);
10790 break;
10791 default:
10792 gcc_assert (0);
10795 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10796 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10797 addr = t;
10798 t = fold_convert (field_ptr_t, addr);
10799 t = build2 (MODIFY_EXPR, field_t,
10800 build1 (INDIRECT_REF, field_t, tmp_ha),
10801 build1 (INDIRECT_REF, field_t, t));
10803 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10804 for (i = 1; i < nregs; ++i)
10806 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10807 u = fold_convert (field_ptr_t, addr);
10808 u = build2 (MODIFY_EXPR, field_t,
10809 build2 (MEM_REF, field_t, tmp_ha,
10810 build_int_cst (field_ptr_t,
10811 (i *
10812 int_size_in_bytes (field_t)))),
10813 build1 (INDIRECT_REF, field_t, u));
10814 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10817 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10818 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10821 COND_EXPR_ELSE (cond2) = t;
10822 addr = fold_convert (build_pointer_type (type), cond1);
10823 addr = build_va_arg_indirect_ref (addr);
10825 if (indirect_p)
10826 addr = build_va_arg_indirect_ref (addr);
10828 return addr;
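/* Hedged C-level sketch (added commentary, not part of the original
   source) of what the function above gimplifies for an 8-byte integer
   argument, using the struct __va_list layout documented above
   aarch64_build_builtin_va_list; the HFA path, alignment rounding and
   big-endian adjustments are omitted:

     void *va_arg_gr8 (struct __va_list *ap)
     {
       int offs = ap->__gr_offs;
       if (offs >= 0)
         goto on_stack;              // register save area already used up
       ap->__gr_offs = offs + 8;     // rsize of the argument
       if (ap->__gr_offs > 0)
         goto on_stack;              // this argument did not fit
       return (char *) ap->__gr_top + offs;
     on_stack:
       {
         void *addr = ap->__stack;
         ap->__stack = (char *) addr + 8;
         return addr;
       }
     }

   The real code builds the equivalent trees (cond1/cond2 above) rather
   than emitting a call to a helper like this.  */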
10831 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10833 static void
10834 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10835 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10836 int no_rtl)
10838 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10839 CUMULATIVE_ARGS local_cum;
10840 int gr_saved = cfun->va_list_gpr_size;
10841 int vr_saved = cfun->va_list_fpr_size;
10843 /* The caller has advanced CUM up to, but not beyond, the last named
10844 argument. Advance a local copy of CUM past the last "real" named
10845 argument, to find out how many registers are left over. */
10846 local_cum = *cum;
10847 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10849 /* Find out how many registers we need to save.
10850 Honor the tree-stdarg analysis results. */
10851 if (cfun->va_list_gpr_size)
10852 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10853 cfun->va_list_gpr_size / UNITS_PER_WORD);
10854 if (cfun->va_list_fpr_size)
10855 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10856 cfun->va_list_fpr_size / UNITS_PER_VREG);
10858 if (!TARGET_FLOAT)
10860 gcc_assert (local_cum.aapcs_nvrn == 0);
10861 vr_saved = 0;
10864 if (!no_rtl)
10866 if (gr_saved > 0)
10868 rtx ptr, mem;
10870 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10871 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10872 - gr_saved * UNITS_PER_WORD);
10873 mem = gen_frame_mem (BLKmode, ptr);
10874 set_mem_alias_set (mem, get_varargs_alias_set ());
10876 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10877 mem, gr_saved);
10879 if (vr_saved > 0)
10881 /* We can't use move_block_from_reg, because it will use
10882 the wrong mode, storing D regs only. */
10883 machine_mode mode = TImode;
10884 int off, i, vr_start;
10886 /* Set OFF to the offset from virtual_incoming_args_rtx of
10887 the first vector register. The VR save area lies below
10888 the GR one, and is aligned to 16 bytes. */
10889 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10890 STACK_BOUNDARY / BITS_PER_UNIT);
10891 off -= vr_saved * UNITS_PER_VREG;
10893 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10894 for (i = 0; i < vr_saved; ++i)
10896 rtx ptr, mem;
10898 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10899 mem = gen_frame_mem (mode, ptr);
10900 set_mem_alias_set (mem, get_varargs_alias_set ());
10901 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10902 off += UNITS_PER_VREG;
10907 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10908 any complication of having crtl->args.pretend_args_size changed. */
10909 cfun->machine->frame.saved_varargs_size
10910 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10911 STACK_BOUNDARY / BITS_PER_UNIT)
10912 + vr_saved * UNITS_PER_VREG);
10915 static void
10916 aarch64_conditional_register_usage (void)
10918 int i;
10919 if (!TARGET_FLOAT)
10921 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10923 fixed_regs[i] = 1;
10924 call_used_regs[i] = 1;
10929 /* Walk down the type tree of TYPE counting consecutive base elements.
10930 If *MODEP is VOIDmode, then set it to the first valid floating point
10931 type. If a non-floating point type is found, or if a floating point
10932 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10933 otherwise return the count in the sub-tree. */
10934 static int
10935 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10937 machine_mode mode;
10938 HOST_WIDE_INT size;
10940 switch (TREE_CODE (type))
10942 case REAL_TYPE:
10943 mode = TYPE_MODE (type);
10944 if (mode != DFmode && mode != SFmode
10945 && mode != TFmode && mode != HFmode)
10946 return -1;
10948 if (*modep == VOIDmode)
10949 *modep = mode;
10951 if (*modep == mode)
10952 return 1;
10954 break;
10956 case COMPLEX_TYPE:
10957 mode = TYPE_MODE (TREE_TYPE (type));
10958 if (mode != DFmode && mode != SFmode
10959 && mode != TFmode && mode != HFmode)
10960 return -1;
10962 if (*modep == VOIDmode)
10963 *modep = mode;
10965 if (*modep == mode)
10966 return 2;
10968 break;
10970 case VECTOR_TYPE:
10971 /* Use V2SImode and V4SImode as representatives of all 64-bit
10972 and 128-bit vector types. */
10973 size = int_size_in_bytes (type);
10974 switch (size)
10976 case 8:
10977 mode = V2SImode;
10978 break;
10979 case 16:
10980 mode = V4SImode;
10981 break;
10982 default:
10983 return -1;
10986 if (*modep == VOIDmode)
10987 *modep = mode;
10989 /* Vector modes are considered to be opaque: two vectors are
10990 equivalent for the purposes of being homogeneous aggregates
10991 if they are the same size. */
10992 if (*modep == mode)
10993 return 1;
10995 break;
10997 case ARRAY_TYPE:
10999 int count;
11000 tree index = TYPE_DOMAIN (type);
11002 /* Can't handle incomplete types nor sizes that are not
11003 fixed. */
11004 if (!COMPLETE_TYPE_P (type)
11005 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11006 return -1;
11008 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11009 if (count == -1
11010 || !index
11011 || !TYPE_MAX_VALUE (index)
11012 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11013 || !TYPE_MIN_VALUE (index)
11014 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11015 || count < 0)
11016 return -1;
11018 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11019 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11021 /* There must be no padding. */
11022 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11023 return -1;
11025 return count;
11028 case RECORD_TYPE:
11030 int count = 0;
11031 int sub_count;
11032 tree field;
11034 /* Can't handle incomplete types nor sizes that are not
11035 fixed. */
11036 if (!COMPLETE_TYPE_P (type)
11037 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11038 return -1;
11040 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11042 if (TREE_CODE (field) != FIELD_DECL)
11043 continue;
11045 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11046 if (sub_count < 0)
11047 return -1;
11048 count += sub_count;
11051 /* There must be no padding. */
11052 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11053 return -1;
11055 return count;
11058 case UNION_TYPE:
11059 case QUAL_UNION_TYPE:
11061 /* These aren't very interesting except in a degenerate case. */
11062 int count = 0;
11063 int sub_count;
11064 tree field;
11066 /* Can't handle incomplete types nor sizes that are not
11067 fixed. */
11068 if (!COMPLETE_TYPE_P (type)
11069 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11070 return -1;
11072 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11074 if (TREE_CODE (field) != FIELD_DECL)
11075 continue;
11077 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11078 if (sub_count < 0)
11079 return -1;
11080 count = count > sub_count ? count : sub_count;
11083 /* There must be no padding. */
11084 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11085 return -1;
11087 return count;
11090 default:
11091 break;
11094 return -1;
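/* Worked examples for the walker above (added commentary, not part of
   the original source):

     struct hfa3  { float a, b, c; };         // returns 3, *modep = SFmode
     struct hfa4  { double d[4]; };           // returns 4, *modep = DFmode
     _Complex double cd;                      // returns 2, *modep = DFmode
     struct hva2  { int32x4_t v[2]; };        // returns 2, *modep = V4SImode
     struct mixed { float f; double d; };     // returns -1 (modes differ)

   A result between 1 and HA_MAX_NUM_FLDS (4) makes the type a candidate
   for passing or returning in consecutive FP/SIMD registers.  */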
11097 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11098 type as described in AAPCS64 \S 4.1.2.
11100 See the comment above aarch64_composite_type_p for the notes on MODE. */
11102 static bool
11103 aarch64_short_vector_p (const_tree type,
11104 machine_mode mode)
11106 HOST_WIDE_INT size = -1;
11108 if (type && TREE_CODE (type) == VECTOR_TYPE)
11109 size = int_size_in_bytes (type);
11110 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11111 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11112 size = GET_MODE_SIZE (mode);
11114 return (size == 8 || size == 16);
11117 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11118 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11119 array types. The C99 floating-point complex types are also considered
11120 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11121 types, which are GCC extensions and out of the scope of AAPCS64, are
11122 treated as composite types here as well.
11124 Note that MODE itself is not sufficient in determining whether a type
11125 is such a composite type or not. This is because
11126 stor-layout.c:compute_record_mode may have already changed the MODE
11127 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11128 structure with only one field may have its MODE set to the mode of the
11129 field. Also an integer mode whose size matches the size of the
11130 RECORD_TYPE type may be used to substitute the original mode
11131 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11132 solely relied on. */
11134 static bool
11135 aarch64_composite_type_p (const_tree type,
11136 machine_mode mode)
11138 if (aarch64_short_vector_p (type, mode))
11139 return false;
11141 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11142 return true;
11144 if (mode == BLKmode
11145 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11146 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11147 return true;
11149 return false;
11152 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11153 shall be passed or returned in simd/fp register(s) (providing these
11154 parameter passing registers are available).
11156 Upon successful return, *COUNT returns the number of needed registers,
11157 *BASE_MODE returns the mode of the individual register and when IS_HA
11158 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11159 floating-point aggregate or a homogeneous short-vector aggregate. */
11161 static bool
11162 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11163 const_tree type,
11164 machine_mode *base_mode,
11165 int *count,
11166 bool *is_ha)
11168 machine_mode new_mode = VOIDmode;
11169 bool composite_p = aarch64_composite_type_p (type, mode);
11171 if (is_ha != NULL) *is_ha = false;
11173 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11174 || aarch64_short_vector_p (type, mode))
11176 *count = 1;
11177 new_mode = mode;
11179 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11181 if (is_ha != NULL) *is_ha = true;
11182 *count = 2;
11183 new_mode = GET_MODE_INNER (mode);
11185 else if (type && composite_p)
11187 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11189 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11191 if (is_ha != NULL) *is_ha = true;
11192 *count = ag_count;
11194 else
11195 return false;
11197 else
11198 return false;
11200 *base_mode = new_mode;
11201 return true;
11204 /* Implement TARGET_STRUCT_VALUE_RTX. */
11206 static rtx
11207 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11208 int incoming ATTRIBUTE_UNUSED)
11210 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11213 /* Implements target hook vector_mode_supported_p. */
11214 static bool
11215 aarch64_vector_mode_supported_p (machine_mode mode)
11217 if (TARGET_SIMD
11218 && (mode == V4SImode || mode == V8HImode
11219 || mode == V16QImode || mode == V2DImode
11220 || mode == V2SImode || mode == V4HImode
11221 || mode == V8QImode || mode == V2SFmode
11222 || mode == V4SFmode || mode == V2DFmode
11223 || mode == V4HFmode || mode == V8HFmode
11224 || mode == V1DFmode))
11225 return true;
11227 return false;
11230 /* Return appropriate SIMD container
11231 for MODE within a vector of WIDTH bits. */
11232 static machine_mode
11233 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11235 gcc_assert (width == 64 || width == 128);
11236 if (TARGET_SIMD)
11238 if (width == 128)
11239 switch (mode)
11241 case E_DFmode:
11242 return V2DFmode;
11243 case E_SFmode:
11244 return V4SFmode;
11245 case E_HFmode:
11246 return V8HFmode;
11247 case E_SImode:
11248 return V4SImode;
11249 case E_HImode:
11250 return V8HImode;
11251 case E_QImode:
11252 return V16QImode;
11253 case E_DImode:
11254 return V2DImode;
11255 default:
11256 break;
11258 else
11259 switch (mode)
11261 case E_SFmode:
11262 return V2SFmode;
11263 case E_HFmode:
11264 return V4HFmode;
11265 case E_SImode:
11266 return V2SImode;
11267 case E_HImode:
11268 return V4HImode;
11269 case E_QImode:
11270 return V8QImode;
11271 default:
11272 break;
11275 return word_mode;
11278 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11279 static machine_mode
11280 aarch64_preferred_simd_mode (scalar_mode mode)
11282 return aarch64_simd_container_mode (mode, 128);
11285 /* Return the bitmask of possible vector sizes for the vectorizer
11286 to iterate over. */
11287 static unsigned int
11288 aarch64_autovectorize_vector_sizes (void)
11290 return (16 | 8);
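/* Illustrative mapping (added commentary, not part of the original
   source): with TARGET_SIMD enabled the helpers above give the
   vectorizer, for example:

     scalar mode   128-bit container   64-bit container
     QImode        V16QImode           V8QImode
     HImode        V8HImode            V4HImode
     SImode        V4SImode            V2SImode
     DImode        V2DImode            (word_mode)
     SFmode        V4SFmode            V2SFmode
     DFmode        V2DFmode            (word_mode)

   The preferred mode is always the 128-bit container, and the size
   bitmask (16 | 8) tells the vectorizer to also try 64-bit vectors.  */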
11293 /* Implement TARGET_MANGLE_TYPE. */
11295 static const char *
11296 aarch64_mangle_type (const_tree type)
11298 /* The AArch64 ABI documents say that "__va_list" has to be
11299 mangled as if it is in the "std" namespace. */
11300 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11301 return "St9__va_list";
11303 /* Half-precision float. */
11304 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11305 return "Dh";
11307 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11308 builtin types. */
11309 if (TYPE_NAME (type) != NULL)
11310 return aarch64_mangle_builtin_type (type);
11312 /* Use the default mangling. */
11313 return NULL;
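/* Mangling examples (added commentary, not part of the original source):

     void f (__fp16);             the __fp16 parameter mangles as "Dh",
                                  giving _Z1fDh
     void g (__builtin_va_list);  the parameter mangles as "St9__va_list",
                                  i.e. as if __va_list lived in namespace std

   Anything else falls back to the builtin-type table or the default
   mangler.  */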
11316 /* Find the first rtx_insn before insn that will generate an assembly
11317 instruction. */
11319 static rtx_insn *
11320 aarch64_prev_real_insn (rtx_insn *insn)
11322 if (!insn)
11323 return NULL;
11325 do
11327 insn = prev_real_insn (insn);
11329 while (insn && recog_memoized (insn) < 0);
11331 return insn;
11334 static bool
11335 is_madd_op (enum attr_type t1)
11337 unsigned int i;
11338 /* A number of these may be AArch32 only. */
11339 enum attr_type mlatypes[] = {
11340 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11341 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11342 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11345 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11347 if (t1 == mlatypes[i])
11348 return true;
11351 return false;
11354 /* Check if there is a register dependency between a load and the insn
11355 for which we hold recog_data. */
11357 static bool
11358 dep_between_memop_and_curr (rtx memop)
11360 rtx load_reg;
11361 int opno;
11363 gcc_assert (GET_CODE (memop) == SET);
11365 if (!REG_P (SET_DEST (memop)))
11366 return false;
11368 load_reg = SET_DEST (memop);
11369 for (opno = 1; opno < recog_data.n_operands; opno++)
11371 rtx operand = recog_data.operand[opno];
11372 if (REG_P (operand)
11373 && reg_overlap_mentioned_p (load_reg, operand))
11374 return true;
11377 return false;
11381 /* When working around the Cortex-A53 erratum 835769,
11382 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11383 instruction and has a preceding memory instruction such that a NOP
11384 should be inserted between them. */
11386 bool
11387 aarch64_madd_needs_nop (rtx_insn* insn)
11389 enum attr_type attr_type;
11390 rtx_insn *prev;
11391 rtx body;
11393 if (!TARGET_FIX_ERR_A53_835769)
11394 return false;
11396 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11397 return false;
11399 attr_type = get_attr_type (insn);
11400 if (!is_madd_op (attr_type))
11401 return false;
11403 prev = aarch64_prev_real_insn (insn);
11404 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11405 Restore recog state to INSN to avoid state corruption. */
11406 extract_constrain_insn_cached (insn);
11408 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11409 return false;
11411 body = single_set (prev);
11413 /* If the previous insn is a memory op and there is no dependency between
11414 it and the DImode madd, emit a NOP between them. If body is NULL then we
11415 have a complex memory operation, probably a load/store pair.
11416 Be conservative for now and emit a NOP. */
11417 if (GET_MODE (recog_data.operand[0]) == DImode
11418 && (!body || !dep_between_memop_and_curr (body)))
11419 return true;
11421 return false;
11426 /* Implement FINAL_PRESCAN_INSN. */
11428 void
11429 aarch64_final_prescan_insn (rtx_insn *insn)
11431 if (aarch64_madd_needs_nop (insn))
11432 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
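/* Hedged example (added commentary, not part of the original source):
   with -mfix-cortex-a53-835769 the two functions above turn a sequence
   such as

     ldr     x1, [x2]
     madd    x0, x3, x4, x0

   into

     ldr     x1, [x2]
     nop     // between mem op and mult-accumulate
     madd    x0, x3, x4, x0

   The nop is emitted only for a 64-bit multiply-accumulate that does not
   consume the value produced by the preceding memory operation; dependent
   sequences and 32-bit multiply-accumulates are left alone, while complex
   memory ops such as load pairs conservatively get the nop.  */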
11436 /* Return the equivalent letter for size. */
11437 static char
11438 sizetochar (int size)
11440 switch (size)
11442 case 64: return 'd';
11443 case 32: return 's';
11444 case 16: return 'h';
11445 case 8 : return 'b';
11446 default: gcc_unreachable ();
11450 /* Return true iff x is a uniform vector of floating-point
11451 constants, and the constant can be represented in
11452 quarter-precision form. Note, as aarch64_float_const_representable
11453 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11454 static bool
11455 aarch64_vect_float_const_representable_p (rtx x)
11457 rtx elt;
11458 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11459 && const_vec_duplicate_p (x, &elt)
11460 && aarch64_float_const_representable_p (elt));
11463 /* Return true for valid and false for invalid. */
11464 bool
11465 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11466 struct simd_immediate_info *info)
11468 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11469 matches = 1; \
11470 for (i = 0; i < idx; i += (STRIDE)) \
11471 if (!(TEST)) \
11472 matches = 0; \
11473 if (matches) \
11475 immtype = (CLASS); \
11476 elsize = (ELSIZE); \
11477 eshift = (SHIFT); \
11478 emvn = (NEG); \
11479 break; \
11482 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11483 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11484 unsigned char bytes[16];
11485 int immtype = -1, matches;
11486 unsigned int invmask = inverse ? 0xff : 0;
11487 int eshift, emvn;
11489 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11491 if (! (aarch64_simd_imm_zero_p (op, mode)
11492 || aarch64_vect_float_const_representable_p (op)))
11493 return false;
11495 if (info)
11497 rtx elt = CONST_VECTOR_ELT (op, 0);
11498 scalar_float_mode elt_mode
11499 = as_a <scalar_float_mode> (GET_MODE (elt));
11501 info->value = elt;
11502 info->element_width = GET_MODE_BITSIZE (elt_mode);
11503 info->mvn = false;
11504 info->shift = 0;
11507 return true;
11510 /* Splat vector constant out into a byte vector. */
11511 for (i = 0; i < n_elts; i++)
11513 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11514 it must be laid out in the vector register in reverse order. */
11515 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11516 unsigned HOST_WIDE_INT elpart;
11518 gcc_assert (CONST_INT_P (el));
11519 elpart = INTVAL (el);
11521 for (unsigned int byte = 0; byte < innersize; byte++)
11523 bytes[idx++] = (elpart & 0xff) ^ invmask;
11524 elpart >>= BITS_PER_UNIT;
11529 /* Sanity check. */
11530 gcc_assert (idx == GET_MODE_SIZE (mode));
11532 do
11534 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11535 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11537 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11538 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11540 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11541 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11543 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11544 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11546 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11548 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11550 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11551 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11553 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11554 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11556 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11557 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11559 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11560 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11562 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11564 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11566 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11567 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11569 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11570 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11572 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11573 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11575 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11576 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11578 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11580 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11581 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11583 while (0);
11585 if (immtype == -1)
11586 return false;
11588 if (info)
11590 info->element_width = elsize;
11591 info->mvn = emvn != 0;
11592 info->shift = eshift;
11594 unsigned HOST_WIDE_INT imm = 0;
11596 if (immtype >= 12 && immtype <= 15)
11597 info->msl = true;
11599 /* Un-invert bytes of recognized vector, if necessary. */
11600 if (invmask != 0)
11601 for (i = 0; i < idx; i++)
11602 bytes[i] ^= invmask;
11604 if (immtype == 17)
11606 /* FIXME: Broken on 32-bit H_W_I hosts. */
11607 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11609 for (i = 0; i < 8; i++)
11610 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11611 << (i * BITS_PER_UNIT);
11614 info->value = GEN_INT (imm);
11616 else
11618 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11619 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11621 /* Construct 'abcdefgh' because the assembler cannot handle
11622 generic constants. */
11623 if (info->mvn)
11624 imm = ~imm;
11625 imm = (imm >> info->shift) & 0xff;
11626 info->value = GEN_INT (imm);
11630 return true;
11631 #undef CHECK
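/* Worked examples for the classifier above (added commentary, not part
   of the original source):

     {0xab, 0xab, 0xab, 0xab}        as V4SI -> value 0xab, width 32,
                                                shift 0,  mvn false  (MOVI)
     {0xab0000, 0xab0000, ...}       as V4SI -> value 0xab, width 32,
                                                shift 16, mvn false  (MOVI, LSL 16)
     {~0xab, ~0xab, ...}             as V4SI -> value 0xab, width 32,
                                                shift 0,  mvn true   (MVNI)

   The assembler mnemonics are only indicative; the actual output is
   generated elsewhere from the simd_immediate_info fields.  */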
11634 /* Check if immediate shift constants are within range. */
11635 bool
11636 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11638 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11639 if (left)
11640 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11641 else
11642 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11645 /* Return true if X is a uniform vector where all elements
11646 are either the floating-point constant 0.0 or the
11647 integer constant 0. */
11648 bool
11649 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11651 return x == CONST0_RTX (mode);
11655 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11656 operation of width WIDTH at bit position POS. */
11658 rtx
11659 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11661 gcc_assert (CONST_INT_P (width));
11662 gcc_assert (CONST_INT_P (pos));
11664 unsigned HOST_WIDE_INT mask
11665 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11666 return GEN_INT (mask << UINTVAL (pos));
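/* Worked example (added commentary, not part of the original source):
   for a zero_extract of width 8 at bit position 16 the helper above
   returns

     ((HOST_WIDE_INT_1U << 8) - 1) << 16  ==  0x00ff0000

   which is the mask selecting exactly those bits.  */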
11669 bool
11670 aarch64_mov_operand_p (rtx x, machine_mode mode)
11672 if (GET_CODE (x) == HIGH
11673 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11674 return true;
11676 if (CONST_INT_P (x))
11677 return true;
11679 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11680 return true;
11682 return aarch64_classify_symbolic_expression (x)
11683 == SYMBOL_TINY_ABSOLUTE;
11686 /* Return a const_int vector of VAL. */
11687 rtx
11688 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11690 int nunits = GET_MODE_NUNITS (mode);
11691 rtvec v = rtvec_alloc (nunits);
11692 int i;
11694 rtx cache = GEN_INT (val);
11696 for (i=0; i < nunits; i++)
11697 RTVEC_ELT (v, i) = cache;
11699 return gen_rtx_CONST_VECTOR (mode, v);
11702 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11704 bool
11705 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11707 machine_mode vmode;
11709 gcc_assert (!VECTOR_MODE_P (mode));
11710 vmode = aarch64_preferred_simd_mode (as_a <scalar_mode> (mode));
11711 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11712 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11715 /* Construct and return a PARALLEL RTX vector with elements numbering the
11716 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11717 the vector - from the perspective of the architecture. This does not
11718 line up with GCC's perspective on lane numbers, so we end up with
11719 different masks depending on our target endian-ness. The diagram
11720 below may help. We must draw the distinction when building masks
11721 which select one half of the vector. An instruction selecting
11722 architectural low-lanes for a big-endian target, must be described using
11723 a mask selecting GCC high-lanes.
11725 Big-Endian Little-Endian
11727 GCC 0 1 2 3 3 2 1 0
11728 | x | x | x | x | | x | x | x | x |
11729 Architecture 3 2 1 0 3 2 1 0
11731 Low Mask: { 2, 3 } { 0, 1 }
11732 High Mask: { 0, 1 } { 2, 3 }
11733 */
11735 rtx
11736 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11738 int nunits = GET_MODE_NUNITS (mode);
11739 rtvec v = rtvec_alloc (nunits / 2);
11740 int high_base = nunits / 2;
11741 int low_base = 0;
11742 int base;
11743 rtx t1;
11744 int i;
11746 if (BYTES_BIG_ENDIAN)
11747 base = high ? low_base : high_base;
11748 else
11749 base = high ? high_base : low_base;
11751 for (i = 0; i < nunits / 2; i++)
11752 RTVEC_ELT (v, i) = GEN_INT (base + i);
11754 t1 = gen_rtx_PARALLEL (mode, v);
11755 return t1;
11758 /* Check OP for validity as a PARALLEL RTX vector with elements
11759 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11760 from the perspective of the architecture. See the diagram above
11761 aarch64_simd_vect_par_cnst_half for more details. */
11763 bool
11764 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11765 bool high)
11767 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11768 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11769 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11770 int i = 0;
11772 if (!VECTOR_MODE_P (mode))
11773 return false;
11775 if (count_op != count_ideal)
11776 return false;
11778 for (i = 0; i < count_ideal; i++)
11780 rtx elt_op = XVECEXP (op, 0, i);
11781 rtx elt_ideal = XVECEXP (ideal, 0, i);
11783 if (!CONST_INT_P (elt_op)
11784 || INTVAL (elt_ideal) != INTVAL (elt_op))
11785 return false;
11787 return true;
11790 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11791 HIGH (exclusive). */
11792 void
11793 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11794 const_tree exp)
11796 HOST_WIDE_INT lane;
11797 gcc_assert (CONST_INT_P (operand));
11798 lane = INTVAL (operand);
11800 if (lane < low || lane >= high)
11802 if (exp)
11803 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11804 else
11805 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11809 /* Return TRUE if OP is a valid vector addressing mode. */
11810 bool
11811 aarch64_simd_mem_operand_p (rtx op)
11813 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11814 || REG_P (XEXP (op, 0)));
11817 /* Emit a register copy from operand to operand, taking care not to
11818 early-clobber source registers in the process.
11820 COUNT is the number of components into which the copy needs to be
11821 decomposed. */
11822 void
11823 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11824 unsigned int count)
11826 unsigned int i;
11827 int rdest = REGNO (operands[0]);
11828 int rsrc = REGNO (operands[1]);
11830 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11831 || rdest < rsrc)
11832 for (i = 0; i < count; i++)
11833 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11834 gen_rtx_REG (mode, rsrc + i));
11835 else
11836 for (i = 0; i < count; i++)
11837 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11838 gen_rtx_REG (mode, rsrc + count - i - 1));
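/* Illustrative example (added commentary, not part of the original
   source): moving an OImode value held in {V1, V2} into {V2, V3}
   overlaps and the destination REGNO is the higher one, so the loop
   above copies backwards: first V3 <- V2, then V2 <- V1, ensuring V2 is
   read before it is overwritten.  Non-overlapping or downward-moving
   copies go front to back instead.  */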
11841 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11842 one of the VSTRUCT modes: OI, CI, or XI. */
11844 aarch64_simd_attr_length_rglist (machine_mode mode)
11846 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11849 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11850 alignment of a vector to 128 bits. */
11851 static HOST_WIDE_INT
11852 aarch64_simd_vector_alignment (const_tree type)
11854 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11855 return MIN (align, 128);
11858 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11859 static bool
11860 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11862 if (is_packed)
11863 return false;
11865 /* We guarantee alignment for vectors up to 128 bits. */
11866 if (tree_int_cst_compare (TYPE_SIZE (type),
11867 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11868 return false;
11870 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11871 return true;
11874 /* Return true if the vector misalignment factor is supported by the
11875 target. */
11876 static bool
11877 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11878 const_tree type, int misalignment,
11879 bool is_packed)
11881 if (TARGET_SIMD && STRICT_ALIGNMENT)
11883 /* Return false if the movmisalign pattern is not supported for this mode. */
11884 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11885 return false;
11887 if (misalignment == -1)
11889 /* Misalignment factor is unknown at compile time but we know
11890 it's word aligned. */
11891 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11893 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11895 if (element_size != 64)
11896 return true;
11898 return false;
11901 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11902 is_packed);
11905 /* If VALS is a vector constant that can be loaded into a register
11906 using DUP, generate instructions to do so and return an RTX to
11907 assign to the register. Otherwise return NULL_RTX. */
11908 static rtx
11909 aarch64_simd_dup_constant (rtx vals)
11911 machine_mode mode = GET_MODE (vals);
11912 machine_mode inner_mode = GET_MODE_INNER (mode);
11913 rtx x;
11915 if (!const_vec_duplicate_p (vals, &x))
11916 return NULL_RTX;
11918 /* We can load this constant by using DUP and a constant in a
11919 single ARM register. This will be cheaper than a vector
11920 load. */
11921 x = copy_to_mode_reg (inner_mode, x);
11922 return gen_rtx_VEC_DUPLICATE (mode, x);
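/* Illustrative sketch (hypothetical constant): a splat such as

     int32x4_t v = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };

   is typically not encodable as a single MOVI/MVNI immediate, but
   const_vec_duplicate_p recognises it, so the code above moves 0x12345678
   into a general register and returns (vec_duplicate:V4SI (reg:SI ...)),
   i.e. one DUP from the GP register instead of a literal-pool load.  */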
11926 /* Generate code to load VALS, which is a PARALLEL containing only
11927 constants (for vec_init) or CONST_VECTOR, efficiently into a
11928 register. Returns an RTX to copy into the register, or NULL_RTX
11929 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
11930 static rtx
11931 aarch64_simd_make_constant (rtx vals)
11933 machine_mode mode = GET_MODE (vals);
11934 rtx const_dup;
11935 rtx const_vec = NULL_RTX;
11936 int n_elts = GET_MODE_NUNITS (mode);
11937 int n_const = 0;
11938 int i;
11940 if (GET_CODE (vals) == CONST_VECTOR)
11941 const_vec = vals;
11942 else if (GET_CODE (vals) == PARALLEL)
11944 /* A CONST_VECTOR must contain only CONST_INTs and
11945 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11946 Only store valid constants in a CONST_VECTOR. */
11947 for (i = 0; i < n_elts; ++i)
11949 rtx x = XVECEXP (vals, 0, i);
11950 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11951 n_const++;
11953 if (n_const == n_elts)
11954 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11956 else
11957 gcc_unreachable ();
11959 if (const_vec != NULL_RTX
11960 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11961 /* Load using MOVI/MVNI. */
11962 return const_vec;
11963 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11964 /* Loaded using DUP. */
11965 return const_dup;
11966 else if (const_vec != NULL_RTX)
11967 /* Load from constant pool. We cannot take advantage of single-cycle
11968 LD1 because we need a PC-relative addressing mode. */
11969 return const_vec;
11970 else
11971 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11972 We cannot construct an initializer. */
11973 return NULL_RTX;
11976 /* Expand a vector initialisation sequence, such that TARGET is
11977 initialised to contain VALS. */
11979 void
11980 aarch64_expand_vector_init (rtx target, rtx vals)
11982 machine_mode mode = GET_MODE (target);
11983 machine_mode inner_mode = GET_MODE_INNER (mode);
11984 /* The number of vector elements. */
11985 int n_elts = GET_MODE_NUNITS (mode);
11986 /* The number of vector elements which are not constant. */
11987 int n_var = 0;
11988 rtx any_const = NULL_RTX;
11989 /* The first element of vals. */
11990 rtx v0 = XVECEXP (vals, 0, 0);
11991 bool all_same = true;
11993 /* Count the number of variable elements to initialise. */
11994 for (int i = 0; i < n_elts; ++i)
11996 rtx x = XVECEXP (vals, 0, i);
11997 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11998 ++n_var;
11999 else
12000 any_const = x;
12002 all_same &= rtx_equal_p (x, v0);
12005 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
12006 how best to handle this. */
12007 if (n_var == 0)
12009 rtx constant = aarch64_simd_make_constant (vals);
12010 if (constant != NULL_RTX)
12012 emit_move_insn (target, constant);
12013 return;
12017 /* Splat a single non-constant element if we can. */
12018 if (all_same)
12020 rtx x = copy_to_mode_reg (inner_mode, v0);
12021 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12022 return;
12025 enum insn_code icode = optab_handler (vec_set_optab, mode);
12026 gcc_assert (icode != CODE_FOR_nothing);
12028 /* If there are only variable elements, try to optimize
12029 the insertion using dup for the most common element
12030 followed by insertions. */
12032 /* The algorithm will fill matches[*][0] with the earliest matching element,
12033 and matches[X][1] with the count of duplicate elements (if X is the
12034 earliest element which has duplicates). */
12036 if (n_var == n_elts && n_elts <= 16)
12038 int matches[16][2] = {0};
12039 for (int i = 0; i < n_elts; i++)
12041 for (int j = 0; j <= i; j++)
12043 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12045 matches[i][0] = j;
12046 matches[j][1]++;
12047 break;
12051 int maxelement = 0;
12052 int maxv = 0;
12053 for (int i = 0; i < n_elts; i++)
12054 if (matches[i][1] > maxv)
12056 maxelement = i;
12057 maxv = matches[i][1];
12060 /* Create a duplicate of the most common element. */
12061 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12062 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12064 /* Insert the rest. */
12065 for (int i = 0; i < n_elts; i++)
12067 rtx x = XVECEXP (vals, 0, i);
12068 if (matches[i][0] == maxelement)
12069 continue;
12070 x = copy_to_mode_reg (inner_mode, x);
12071 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12073 return;
12076 /* Initialise a vector which is part-variable. We want to first try
12077 to build those lanes which are constant in the most efficient way we
12078 can. */
12079 if (n_var != n_elts)
12081 rtx copy = copy_rtx (vals);
12083 /* Load constant part of vector. We really don't care what goes into the
12084 parts we will overwrite, but we're more likely to be able to load the
12085 constant efficiently if it has fewer, larger, repeating parts
12086 (see aarch64_simd_valid_immediate). */
12087 for (int i = 0; i < n_elts; i++)
12089 rtx x = XVECEXP (vals, 0, i);
12090 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12091 continue;
12092 rtx subst = any_const;
12093 for (int bit = n_elts / 2; bit > 0; bit /= 2)
12095 /* Look in the copied vector, as more elements are const. */
12096 rtx test = XVECEXP (copy, 0, i ^ bit);
12097 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12099 subst = test;
12100 break;
12103 XVECEXP (copy, 0, i) = subst;
12105 aarch64_expand_vector_init (target, copy);
12108 /* Insert the variable lanes directly. */
12109 for (int i = 0; i < n_elts; i++)
12111 rtx x = XVECEXP (vals, 0, i);
12112 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12113 continue;
12114 x = copy_to_mode_reg (inner_mode, x);
12115 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
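/* Illustrative sketch (w_a and w_b are placeholder registers): for an
   all-variable vector such as { a, b, a, a }, the matches[][] scan above
   finds that lane 0's value occurs three times, so the expansion is roughly

     dup  v0.4s, w_a        // splat the most common element
     ins  v0.s[1], w_b      // then patch the remaining lane(s)

   rather than an insertion into every single lane.  */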
12119 static unsigned HOST_WIDE_INT
12120 aarch64_shift_truncation_mask (machine_mode mode)
12122 return
12123 (!SHIFT_COUNT_TRUNCATED
12124 || aarch64_vector_mode_supported_p (mode)
12125 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
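/* Illustrative sketch: when SHIFT_COUNT_TRUNCATED holds, the hook above
   returns 63 for scalar DImode, which lets the middle end simplify e.g.

     x << (n & 63)    into    x << n

   because the scalar shift only looks at the low six bits of the count.
   For vector (and vector-struct) modes it returns 0, since AdvSIMD shifts
   are not assumed to truncate their counts that way.  */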
12128 /* Select a format to encode pointers in exception handling data. */
12130 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12132 int type;
12133 switch (aarch64_cmodel)
12135 case AARCH64_CMODEL_TINY:
12136 case AARCH64_CMODEL_TINY_PIC:
12137 case AARCH64_CMODEL_SMALL:
12138 case AARCH64_CMODEL_SMALL_PIC:
12139 case AARCH64_CMODEL_SMALL_SPIC:
12140 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
12141 for everything. */
12142 type = DW_EH_PE_sdata4;
12143 break;
12144 default:
12145 /* No assumptions here. 8-byte relocs required. */
12146 type = DW_EH_PE_sdata8;
12147 break;
12149 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12152 /* The last .arch and .tune assembly strings that we printed. */
12153 static std::string aarch64_last_printed_arch_string;
12154 static std::string aarch64_last_printed_tune_string;
12156 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12157 by the function fndecl. */
12159 void
12160 aarch64_declare_function_name (FILE *stream, const char* name,
12161 tree fndecl)
12163 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12165 struct cl_target_option *targ_options;
12166 if (target_parts)
12167 targ_options = TREE_TARGET_OPTION (target_parts);
12168 else
12169 targ_options = TREE_TARGET_OPTION (target_option_current_node);
12170 gcc_assert (targ_options);
12172 const struct processor *this_arch
12173 = aarch64_get_arch (targ_options->x_explicit_arch);
12175 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12176 std::string extension
12177 = aarch64_get_extension_string_for_isa_flags (isa_flags,
12178 this_arch->flags);
12179 /* Only update the assembler .arch string if it is distinct from the last
12180 such string we printed. */
12181 std::string to_print = this_arch->name + extension;
12182 if (to_print != aarch64_last_printed_arch_string)
12184 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12185 aarch64_last_printed_arch_string = to_print;
12188 /* Print the cpu name we're tuning for in the comments; it might be
12189 useful to readers of the generated asm. Do it only when it changes
12190 from function to function and verbose assembly is requested. */
12191 const struct processor *this_tune
12192 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12194 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12196 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12197 this_tune->name);
12198 aarch64_last_printed_tune_string = this_tune->name;
12201 /* Don't forget the type directive for ELF. */
12202 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12203 ASM_OUTPUT_LABEL (stream, name);
12206 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12208 static void
12209 aarch64_start_file (void)
12211 struct cl_target_option *default_options
12212 = TREE_TARGET_OPTION (target_option_default_node);
12214 const struct processor *default_arch
12215 = aarch64_get_arch (default_options->x_explicit_arch);
12216 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12217 std::string extension
12218 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12219 default_arch->flags);
12221 aarch64_last_printed_arch_string = default_arch->name + extension;
12222 aarch64_last_printed_tune_string = "";
12223 asm_fprintf (asm_out_file, "\t.arch %s\n",
12224 aarch64_last_printed_arch_string.c_str ());
12226 default_file_start ();
12229 /* Emit load exclusive. */
12231 static void
12232 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12233 rtx mem, rtx model_rtx)
12235 rtx (*gen) (rtx, rtx, rtx);
12237 switch (mode)
12239 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12240 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12241 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12242 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12243 default:
12244 gcc_unreachable ();
12247 emit_insn (gen (rval, mem, model_rtx));
12250 /* Emit store exclusive. */
12252 static void
12253 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12254 rtx rval, rtx mem, rtx model_rtx)
12256 rtx (*gen) (rtx, rtx, rtx, rtx);
12258 switch (mode)
12260 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12261 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12262 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12263 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12264 default:
12265 gcc_unreachable ();
12268 emit_insn (gen (bval, rval, mem, model_rtx));
12271 /* Emit INSN as a jump and mark it as unlikely to be taken. */
12273 static void
12274 aarch64_emit_unlikely_jump (rtx insn)
12276 rtx_insn *jump = emit_jump_insn (insn);
12277 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12280 /* Expand a compare and swap pattern. */
12282 void
12283 aarch64_expand_compare_and_swap (rtx operands[])
12285 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12286 machine_mode mode, cmp_mode;
12287 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12288 int idx;
12289 gen_cas_fn gen;
12290 const gen_cas_fn split_cas[] =
12292 gen_aarch64_compare_and_swapqi,
12293 gen_aarch64_compare_and_swaphi,
12294 gen_aarch64_compare_and_swapsi,
12295 gen_aarch64_compare_and_swapdi
12297 const gen_cas_fn atomic_cas[] =
12299 gen_aarch64_compare_and_swapqi_lse,
12300 gen_aarch64_compare_and_swaphi_lse,
12301 gen_aarch64_compare_and_swapsi_lse,
12302 gen_aarch64_compare_and_swapdi_lse
12305 bval = operands[0];
12306 rval = operands[1];
12307 mem = operands[2];
12308 oldval = operands[3];
12309 newval = operands[4];
12310 is_weak = operands[5];
12311 mod_s = operands[6];
12312 mod_f = operands[7];
12313 mode = GET_MODE (mem);
12314 cmp_mode = mode;
12316 /* Normally the succ memory model must be stronger than fail, but in the
12317 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12318 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12320 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12321 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12322 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12324 switch (mode)
12326 case E_QImode:
12327 case E_HImode:
12328 /* For short modes, we're going to perform the comparison in SImode,
12329 so do the zero-extension now. */
12330 cmp_mode = SImode;
12331 rval = gen_reg_rtx (SImode);
12332 oldval = convert_modes (SImode, mode, oldval, true);
12333 /* Fall through. */
12335 case E_SImode:
12336 case E_DImode:
12337 /* Force the value into a register if needed. */
12338 if (!aarch64_plus_operand (oldval, mode))
12339 oldval = force_reg (cmp_mode, oldval);
12340 break;
12342 default:
12343 gcc_unreachable ();
12346 switch (mode)
12348 case E_QImode: idx = 0; break;
12349 case E_HImode: idx = 1; break;
12350 case E_SImode: idx = 2; break;
12351 case E_DImode: idx = 3; break;
12352 default:
12353 gcc_unreachable ();
12355 if (TARGET_LSE)
12356 gen = atomic_cas[idx];
12357 else
12358 gen = split_cas[idx];
12360 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12362 if (mode == QImode || mode == HImode)
12363 emit_move_insn (operands[1], gen_lowpart (mode, rval));
12365 x = gen_rtx_REG (CCmode, CC_REGNUM);
12366 x = gen_rtx_EQ (SImode, x, const0_rtx);
12367 emit_insn (gen_rtx_SET (bval, x));
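/* Illustrative sketch (x is a hypothetical int object): this expander is
   what a source-level strong compare-exchange such as

     int x = 0, expected = 0;
     __atomic_compare_exchange_n (&x, &expected, 1, 0,
                                  __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);

   reaches.  With TARGET_LSE (e.g. -march=armv8.1-a) it picks the single
   CAS-instruction pattern; otherwise it uses the LDXR/STXR loop that
   aarch64_split_compare_and_swap below splits out.  */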
12370 /* Test whether the target supports using an atomic load-operate instruction
12371 for operation CODE. Returns FALSE if the operation isn't supported by the
12372 architecture. */
12376 bool
12377 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12379 if (!TARGET_LSE)
12380 return false;
12382 switch (code)
12384 case SET:
12385 case AND:
12386 case IOR:
12387 case XOR:
12388 case MINUS:
12389 case PLUS:
12390 return true;
12391 default:
12392 return false;
12396 /* Emit a barrier appropriate for memory model MODEL at the end of a
12397 sequence implementing an atomic operation. */
12399 static void
12400 aarch64_emit_post_barrier (enum memmodel model)
12402 const enum memmodel base_model = memmodel_base (model);
12404 if (is_mm_sync (model)
12405 && (base_model == MEMMODEL_ACQUIRE
12406 || base_model == MEMMODEL_ACQ_REL
12407 || base_model == MEMMODEL_SEQ_CST))
12409 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12413 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12414 for the data in memory. EXPECTED is the value expected to be in memory.
12415 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12416 is the memory ordering to use. */
12418 void
12419 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12420 rtx expected, rtx desired,
12421 rtx model)
12423 rtx (*gen) (rtx, rtx, rtx, rtx);
12424 machine_mode mode;
12426 mode = GET_MODE (mem);
12428 switch (mode)
12430 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12431 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12432 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12433 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12434 default:
12435 gcc_unreachable ();
12438 /* Move the expected value into the CAS destination register. */
12439 emit_insn (gen_rtx_SET (rval, expected));
12441 /* Emit the CAS. */
12442 emit_insn (gen (rval, mem, desired, model));
12444 /* Compare the expected value with the value loaded by the CAS, to establish
12445 whether the swap was made. */
12446 aarch64_gen_compare_reg (EQ, rval, expected);
12449 /* Split a compare and swap pattern. */
12451 void
12452 aarch64_split_compare_and_swap (rtx operands[])
12454 rtx rval, mem, oldval, newval, scratch;
12455 machine_mode mode;
12456 bool is_weak;
12457 rtx_code_label *label1, *label2;
12458 rtx x, cond;
12459 enum memmodel model;
12460 rtx model_rtx;
12462 rval = operands[0];
12463 mem = operands[1];
12464 oldval = operands[2];
12465 newval = operands[3];
12466 is_weak = (operands[4] != const0_rtx);
12467 model_rtx = operands[5];
12468 scratch = operands[7];
12469 mode = GET_MODE (mem);
12470 model = memmodel_from_int (INTVAL (model_rtx));
12472 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12473 loop:
12474 .label1:
12475 LD[A]XR rval, [mem]
12476 CBNZ rval, .label2
12477 ST[L]XR scratch, newval, [mem]
12478 CBNZ scratch, .label1
12479 .label2:
12480 CMP rval, 0. */
12481 bool strong_zero_p = !is_weak && oldval == const0_rtx;
12483 label1 = NULL;
12484 if (!is_weak)
12486 label1 = gen_label_rtx ();
12487 emit_label (label1);
12489 label2 = gen_label_rtx ();
12491 /* The initial load can be relaxed for a __sync operation since a final
12492 barrier will be emitted to stop code hoisting. */
12493 if (is_mm_sync (model))
12494 aarch64_emit_load_exclusive (mode, rval, mem,
12495 GEN_INT (MEMMODEL_RELAXED));
12496 else
12497 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12499 if (strong_zero_p)
12501 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12502 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12503 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12504 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12506 else
12508 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12509 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12510 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12511 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12512 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12515 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12517 if (!is_weak)
12519 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12520 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12521 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12522 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12524 else
12526 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12527 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12528 emit_insn (gen_rtx_SET (cond, x));
12531 emit_label (label2);
12532 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
12533 to set the condition flags. If this is not used, it will be removed by
12534 later passes. */
12535 if (strong_zero_p)
12537 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12538 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12539 emit_insn (gen_rtx_SET (cond, x));
12541 /* Emit any final barrier needed for a __sync operation. */
12542 if (is_mm_sync (model))
12543 aarch64_emit_post_barrier (model);
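/* Illustrative sketch: for the general strong case (OLDVAL not known to be
   zero) the split above produces roughly

     .label1:
        ld[a]xr  rval, [mem]
        cmp      rval, oldval
        b.ne     .label2
        st[l]xr  scratch, newval, [mem]
        cbnz     scratch, .label1
     .label2:

   leaving the comparison result in the condition flags for the caller.  */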
12546 /* Emit a BIC instruction. */
12548 static void
12549 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12551 rtx shift_rtx = GEN_INT (shift);
12552 rtx (*gen) (rtx, rtx, rtx, rtx);
12554 switch (mode)
12556 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12557 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12558 default:
12559 gcc_unreachable ();
12562 emit_insn (gen (dst, s2, shift_rtx, s1));
12565 /* Emit an atomic swap. */
12567 static void
12568 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12569 rtx mem, rtx model)
12571 rtx (*gen) (rtx, rtx, rtx, rtx);
12573 switch (mode)
12575 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12576 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12577 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12578 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12579 default:
12580 gcc_unreachable ();
12583 emit_insn (gen (dst, mem, value, model));
12586 /* Operations supported by aarch64_emit_atomic_load_op. */
12588 enum aarch64_atomic_load_op_code
12590 AARCH64_LDOP_PLUS, /* A + B */
12591 AARCH64_LDOP_XOR, /* A ^ B */
12592 AARCH64_LDOP_OR, /* A | B */
12593 AARCH64_LDOP_BIC /* A & ~B */
12596 /* Emit an atomic load-operate. */
12598 static void
12599 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12600 machine_mode mode, rtx dst, rtx src,
12601 rtx mem, rtx model)
12603 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12604 const aarch64_atomic_load_op_fn plus[] =
12606 gen_aarch64_atomic_loadaddqi,
12607 gen_aarch64_atomic_loadaddhi,
12608 gen_aarch64_atomic_loadaddsi,
12609 gen_aarch64_atomic_loadadddi
12611 const aarch64_atomic_load_op_fn eor[] =
12613 gen_aarch64_atomic_loadeorqi,
12614 gen_aarch64_atomic_loadeorhi,
12615 gen_aarch64_atomic_loadeorsi,
12616 gen_aarch64_atomic_loadeordi
12618 const aarch64_atomic_load_op_fn ior[] =
12620 gen_aarch64_atomic_loadsetqi,
12621 gen_aarch64_atomic_loadsethi,
12622 gen_aarch64_atomic_loadsetsi,
12623 gen_aarch64_atomic_loadsetdi
12625 const aarch64_atomic_load_op_fn bic[] =
12627 gen_aarch64_atomic_loadclrqi,
12628 gen_aarch64_atomic_loadclrhi,
12629 gen_aarch64_atomic_loadclrsi,
12630 gen_aarch64_atomic_loadclrdi
12632 aarch64_atomic_load_op_fn gen;
12633 int idx = 0;
12635 switch (mode)
12637 case E_QImode: idx = 0; break;
12638 case E_HImode: idx = 1; break;
12639 case E_SImode: idx = 2; break;
12640 case E_DImode: idx = 3; break;
12641 default:
12642 gcc_unreachable ();
12645 switch (code)
12647 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12648 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12649 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12650 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12651 default:
12652 gcc_unreachable ();
12655 emit_insn (gen (dst, mem, src, model));
12658 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12659 location to store the data read from memory. OUT_RESULT is the location to
12660 store the result of the operation. MEM is the memory location to read and
12661 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12662 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12663 be NULL. */
12665 void
12666 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12667 rtx mem, rtx value, rtx model_rtx)
12669 machine_mode mode = GET_MODE (mem);
12670 machine_mode wmode = (mode == DImode ? DImode : SImode);
12671 const bool short_mode = (mode < SImode);
12672 aarch64_atomic_load_op_code ldop_code;
12673 rtx src;
12674 rtx x;
12676 if (out_data)
12677 out_data = gen_lowpart (mode, out_data);
12679 if (out_result)
12680 out_result = gen_lowpart (mode, out_result);
12682 /* Make sure the value is in a register, putting it into a destination
12683 register if it needs to be manipulated. */
12684 if (!register_operand (value, mode)
12685 || code == AND || code == MINUS)
12687 src = out_result ? out_result : out_data;
12688 emit_move_insn (src, gen_lowpart (mode, value));
12690 else
12691 src = value;
12692 gcc_assert (register_operand (src, mode));
12694 /* Preprocess the data for the operation as necessary. If the operation is
12695 a SET then emit a swap instruction and finish. */
12696 switch (code)
12698 case SET:
12699 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12700 return;
12702 case MINUS:
12703 /* Negate the value and treat it as a PLUS. */
12705 rtx neg_src;
12707 /* Resize the value if necessary. */
12708 if (short_mode)
12709 src = gen_lowpart (wmode, src);
12711 neg_src = gen_rtx_NEG (wmode, src);
12712 emit_insn (gen_rtx_SET (src, neg_src));
12714 if (short_mode)
12715 src = gen_lowpart (mode, src);
12717 /* Fall-through. */
12718 case PLUS:
12719 ldop_code = AARCH64_LDOP_PLUS;
12720 break;
12722 case IOR:
12723 ldop_code = AARCH64_LDOP_OR;
12724 break;
12726 case XOR:
12727 ldop_code = AARCH64_LDOP_XOR;
12728 break;
12730 case AND:
12732 rtx not_src;
12734 /* Resize the value if necessary. */
12735 if (short_mode)
12736 src = gen_lowpart (wmode, src);
12738 not_src = gen_rtx_NOT (wmode, src);
12739 emit_insn (gen_rtx_SET (src, not_src));
12741 if (short_mode)
12742 src = gen_lowpart (mode, src);
12744 ldop_code = AARCH64_LDOP_BIC;
12745 break;
12747 default:
12748 /* The operation can't be done with atomic instructions. */
12749 gcc_unreachable ();
12752 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12754 /* If necessary, calculate the data in memory after the update by redoing the
12755 operation from values in registers. */
12756 if (!out_result)
12757 return;
12759 if (short_mode)
12761 src = gen_lowpart (wmode, src);
12762 out_data = gen_lowpart (wmode, out_data);
12763 out_result = gen_lowpart (wmode, out_result);
12766 x = NULL_RTX;
12768 switch (code)
12770 case MINUS:
12771 case PLUS:
12772 x = gen_rtx_PLUS (wmode, out_data, src);
12773 break;
12774 case IOR:
12775 x = gen_rtx_IOR (wmode, out_data, src);
12776 break;
12777 case XOR:
12778 x = gen_rtx_XOR (wmode, out_data, src);
12779 break;
12780 case AND:
12781 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12782 return;
12783 default:
12784 gcc_unreachable ();
12787 emit_set_insn (out_result, x);
12789 return;
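/* Illustrative sketch (tmp is a scratch register): LSE has no LDSUB or
   LDAND, so the preprocessing above rewrites, roughly,

     __atomic_fetch_sub (&x, n, ...)   ->   neg tmp, n ; ldadd tmp, old, [x]
     __atomic_fetch_and (&x, m, ...)   ->   mvn tmp, m ; ldclr tmp, old, [x]

   and, when the value after the operation is also wanted (OUT_RESULT), it
   recomputes that value from the loaded data and TMP in registers.  */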
12792 /* Split an atomic operation. */
12794 void
12795 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12796 rtx value, rtx model_rtx, rtx cond)
12798 machine_mode mode = GET_MODE (mem);
12799 machine_mode wmode = (mode == DImode ? DImode : SImode);
12800 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12801 const bool is_sync = is_mm_sync (model);
12802 rtx_code_label *label;
12803 rtx x;
12805 /* Split the atomic operation into a sequence. */
12806 label = gen_label_rtx ();
12807 emit_label (label);
12809 if (new_out)
12810 new_out = gen_lowpart (wmode, new_out);
12811 if (old_out)
12812 old_out = gen_lowpart (wmode, old_out);
12813 else
12814 old_out = new_out;
12815 value = simplify_gen_subreg (wmode, value, mode, 0);
12817 /* The initial load can be relaxed for a __sync operation since a final
12818 barrier will be emitted to stop code hoisting. */
12819 if (is_sync)
12820 aarch64_emit_load_exclusive (mode, old_out, mem,
12821 GEN_INT (MEMMODEL_RELAXED));
12822 else
12823 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12825 switch (code)
12827 case SET:
12828 new_out = value;
12829 break;
12831 case NOT:
12832 x = gen_rtx_AND (wmode, old_out, value);
12833 emit_insn (gen_rtx_SET (new_out, x));
12834 x = gen_rtx_NOT (wmode, new_out);
12835 emit_insn (gen_rtx_SET (new_out, x));
12836 break;
12838 case MINUS:
12839 if (CONST_INT_P (value))
12841 value = GEN_INT (-INTVAL (value));
12842 code = PLUS;
12844 /* Fall through. */
12846 default:
12847 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12848 emit_insn (gen_rtx_SET (new_out, x));
12849 break;
12852 aarch64_emit_store_exclusive (mode, cond, mem,
12853 gen_lowpart (mode, new_out), model_rtx);
12855 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12856 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12857 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12858 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12860 /* Emit any final barrier needed for a __sync operation. */
12861 if (is_sync)
12862 aarch64_emit_post_barrier (model);
12865 static void
12866 aarch64_init_libfuncs (void)
12868 /* Half-precision float operations. The compiler handles all operations
12869 with NULL libfuncs by converting to SFmode. */
12871 /* Conversions. */
12872 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12873 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12875 /* Arithmetic. */
12876 set_optab_libfunc (add_optab, HFmode, NULL);
12877 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12878 set_optab_libfunc (smul_optab, HFmode, NULL);
12879 set_optab_libfunc (neg_optab, HFmode, NULL);
12880 set_optab_libfunc (sub_optab, HFmode, NULL);
12882 /* Comparisons. */
12883 set_optab_libfunc (eq_optab, HFmode, NULL);
12884 set_optab_libfunc (ne_optab, HFmode, NULL);
12885 set_optab_libfunc (lt_optab, HFmode, NULL);
12886 set_optab_libfunc (le_optab, HFmode, NULL);
12887 set_optab_libfunc (ge_optab, HFmode, NULL);
12888 set_optab_libfunc (gt_optab, HFmode, NULL);
12889 set_optab_libfunc (unord_optab, HFmode, NULL);
12892 /* Target hook for c_mode_for_suffix. */
12893 static machine_mode
12894 aarch64_c_mode_for_suffix (char suffix)
12896 if (suffix == 'q')
12897 return TFmode;
12899 return VOIDmode;
12902 /* We can only represent floating point constants which will fit in
12903 "quarter-precision" values. These values are characterised by
12904 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
12907 (-1)^s * (n/16) * 2^r
12909 Where:
12910 's' is the sign bit.
12911 'n' is an integer in the range 16 <= n <= 31.
12912 'r' is an integer in the range -3 <= r <= 4. */
12914 /* Return true iff X can be represented by a quarter-precision
12915 floating point immediate operand. Note that we cannot represent 0.0. */
12916 bool
12917 aarch64_float_const_representable_p (rtx x)
12919 /* This represents our current view of how many bits
12920 make up the mantissa. */
12921 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12922 int exponent;
12923 unsigned HOST_WIDE_INT mantissa, mask;
12924 REAL_VALUE_TYPE r, m;
12925 bool fail;
12927 if (!CONST_DOUBLE_P (x))
12928 return false;
12930 /* We don't support HFmode constants yet. */
12931 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12932 return false;
12934 r = *CONST_DOUBLE_REAL_VALUE (x);
12936 /* We cannot represent infinities, NaNs or +/-zero. We won't
12937 know if we have +zero until we analyse the mantissa, but we
12938 can reject the other invalid values. */
12939 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12940 || REAL_VALUE_MINUS_ZERO (r))
12941 return false;
12943 /* Extract exponent. */
12944 r = real_value_abs (&r);
12945 exponent = REAL_EXP (&r);
12947 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12948 highest (sign) bit, with a fixed binary point at bit point_pos.
12949 W holds the low part of the mantissa in its first element, the high part in its second.
12950 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12951 bits for the mantissa, this can fail (low bits will be lost). */
12952 real_ldexp (&m, &r, point_pos - exponent);
12953 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12955 /* If the low part of the mantissa has bits set we cannot represent
12956 the value. */
12957 if (w.ulow () != 0)
12958 return false;
12959 /* We have rejected the lower HOST_WIDE_INT, so update our
12960 understanding of how many bits lie in the mantissa and
12961 look only at the high HOST_WIDE_INT. */
12962 mantissa = w.elt (1);
12963 point_pos -= HOST_BITS_PER_WIDE_INT;
12965 /* We can only represent values with a mantissa of the form 1.xxxx. */
12966 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12967 if ((mantissa & mask) != 0)
12968 return false;
12970 /* Having filtered unrepresentable values, we may now remove all
12971 but the highest 5 bits. */
12972 mantissa >>= point_pos - 5;
12974 /* We cannot represent the value 0.0, so reject it. This is handled
12975 elsewhere. */
12976 if (mantissa == 0)
12977 return false;
12979 /* Then, as bit 4 is always set, we can mask it off, leaving
12980 the mantissa in the range [0, 15]. */
12981 mantissa &= ~(1 << 4);
12982 gcc_assert (mantissa <= 15);
12984 /* GCC internally does not use IEEE754-like encoding (where normalized
12985 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
12986 Our mantissa values are shifted 4 places to the left relative to
12987 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12988 by 5 places to correct for GCC's representation. */
12989 exponent = 5 - exponent;
12991 return (exponent >= 0 && exponent <= 7);
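/* Illustrative sketch: 0.5 is representable here, since
   0.5 = (-1)^0 * (16/16) * 2^-1 (s = 0, n = 16, r = -1), and so can be
   materialised as "fmov d0, #0.5"; 0.1 has no exact form n/16 * 2^r and is
   rejected above, forcing a constant-pool (or integer move) sequence
   instead.  */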
12994 char*
12995 aarch64_output_simd_mov_immediate (rtx const_vector,
12996 machine_mode mode,
12997 unsigned width)
12999 bool is_valid;
13000 static char templ[40];
13001 const char *mnemonic;
13002 const char *shift_op;
13003 unsigned int lane_count = 0;
13004 char element_char;
13006 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13008 /* This will return true to show CONST_VECTOR is legal for use as
13009 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
13010 also update INFO to show how the immediate should be generated. */
13011 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13012 gcc_assert (is_valid);
13014 element_char = sizetochar (info.element_width);
13015 lane_count = width / info.element_width;
13017 mode = GET_MODE_INNER (mode);
13018 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13020 gcc_assert (info.shift == 0 && ! info.mvn);
13021 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13022 move immediate path. */
13023 if (aarch64_float_const_zero_rtx_p (info.value))
13024 info.value = GEN_INT (0);
13025 else
13027 const unsigned int buf_size = 20;
13028 char float_buf[buf_size] = {'\0'};
13029 real_to_decimal_for_mode (float_buf,
13030 CONST_DOUBLE_REAL_VALUE (info.value),
13031 buf_size, buf_size, 1, mode);
13033 if (lane_count == 1)
13034 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13035 else
13036 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13037 lane_count, element_char, float_buf);
13038 return templ;
13042 mnemonic = info.mvn ? "mvni" : "movi";
13043 shift_op = info.msl ? "msl" : "lsl";
13045 gcc_assert (CONST_INT_P (info.value));
13046 if (lane_count == 1)
13047 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13048 mnemonic, UINTVAL (info.value));
13049 else if (info.shift)
13050 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13051 ", %s %d", mnemonic, lane_count, element_char,
13052 UINTVAL (info.value), shift_op, info.shift);
13053 else
13054 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13055 mnemonic, lane_count, element_char, UINTVAL (info.value));
13056 return templ;
13059 char*
13060 aarch64_output_scalar_simd_mov_immediate (rtx immediate, machine_mode mode)
13063 /* If a floating point number was passed and we desire to use it in an
13064 integer mode, do the conversion to integer. */
13065 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13067 unsigned HOST_WIDE_INT ival;
13068 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13069 gcc_unreachable ();
13070 immediate = gen_int_mode (ival, mode);
13073 machine_mode vmode;
13074 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we use
13075 a 128-bit vector mode. */
13076 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13078 gcc_assert (!VECTOR_MODE_P (mode));
13079 vmode = aarch64_simd_container_mode (mode, width);
13080 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13081 return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13084 /* Split operands into moves from op[1] + op[2] into op[0]. */
13086 void
13087 aarch64_split_combinev16qi (rtx operands[3])
13089 unsigned int dest = REGNO (operands[0]);
13090 unsigned int src1 = REGNO (operands[1]);
13091 unsigned int src2 = REGNO (operands[2]);
13092 machine_mode halfmode = GET_MODE (operands[1]);
13093 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13094 rtx destlo, desthi;
13096 gcc_assert (halfmode == V16QImode);
13098 if (src1 == dest && src2 == dest + halfregs)
13100 /* No-op move. Can't split to nothing; emit something. */
13101 emit_note (NOTE_INSN_DELETED);
13102 return;
13105 /* Preserve register attributes for variable tracking. */
13106 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13107 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13108 GET_MODE_SIZE (halfmode));
13110 /* Special case of reversed high/low parts. */
13111 if (reg_overlap_mentioned_p (operands[2], destlo)
13112 && reg_overlap_mentioned_p (operands[1], desthi))
13114 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13115 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13116 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13118 else if (!reg_overlap_mentioned_p (operands[2], destlo))
13120 /* Try to avoid unnecessary moves if part of the result
13121 is in the right place already. */
13122 if (src1 != dest)
13123 emit_move_insn (destlo, operands[1]);
13124 if (src2 != dest + halfregs)
13125 emit_move_insn (desthi, operands[2]);
13127 else
13129 if (src2 != dest + halfregs)
13130 emit_move_insn (desthi, operands[2]);
13131 if (src1 != dest)
13132 emit_move_insn (destlo, operands[1]);
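/* Illustrative sketch: the three EORs in the reversed-halves case above are
   the classic scratch-free swap,

     a ^= b;  b ^= a;  a ^= b;

   applied to whole V16QI registers, so the two halves can be exchanged even
   though every destination overlaps a source.  */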
13136 /* vec_perm support. */
13138 #define MAX_VECT_LEN 16
13140 struct expand_vec_perm_d
13142 rtx target, op0, op1;
13143 unsigned char perm[MAX_VECT_LEN];
13144 machine_mode vmode;
13145 unsigned char nelt;
13146 bool one_vector_p;
13147 bool testing_p;
13150 /* Generate a variable permutation. */
13152 static void
13153 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13155 machine_mode vmode = GET_MODE (target);
13156 bool one_vector_p = rtx_equal_p (op0, op1);
13158 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13159 gcc_checking_assert (GET_MODE (op0) == vmode);
13160 gcc_checking_assert (GET_MODE (op1) == vmode);
13161 gcc_checking_assert (GET_MODE (sel) == vmode);
13162 gcc_checking_assert (TARGET_SIMD);
13164 if (one_vector_p)
13166 if (vmode == V8QImode)
13168 /* Expand the argument to V16QImode by duplicating it. */
13169 rtx pair = gen_reg_rtx (V16QImode);
13170 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13171 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13173 else
13175 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13178 else
13180 rtx pair;
13182 if (vmode == V8QImode)
13184 pair = gen_reg_rtx (V16QImode);
13185 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13186 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13188 else
13190 pair = gen_reg_rtx (OImode);
13191 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13192 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13197 void
13198 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13200 machine_mode vmode = GET_MODE (target);
13201 unsigned int nelt = GET_MODE_NUNITS (vmode);
13202 bool one_vector_p = rtx_equal_p (op0, op1);
13203 rtx mask;
13205 /* The TBL instruction does not use a modulo index, so we must take care
13206 of that ourselves. */
13207 mask = aarch64_simd_gen_const_vector_dup (vmode,
13208 one_vector_p ? nelt - 1 : 2 * nelt - 1);
13209 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13211 /* For big-endian, we also need to reverse the index within the vector
13212 (but not which vector). */
13213 if (BYTES_BIG_ENDIAN)
13215 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13216 if (!one_vector_p)
13217 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13218 sel = expand_simple_binop (vmode, XOR, sel, mask,
13219 NULL, 0, OPTAB_LIB_WIDEN);
13221 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
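/* Illustrative sketch: for a two-input V16QI permute the code above
   effectively computes

     sel = sel & 31;      (wrap indices, since TBL writes 0 for out-of-range)
     sel = sel ^ 15;      (big-endian only: reverse the lane numbering
                           within each input vector, but not which vector)

   before handing the selector to the TBL expansion.  */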
13224 /* Recognize patterns suitable for the TRN instructions. */
13225 static bool
13226 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13228 unsigned int i, odd, mask, nelt = d->nelt;
13229 rtx out, in0, in1, x;
13230 rtx (*gen) (rtx, rtx, rtx);
13231 machine_mode vmode = d->vmode;
13233 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13234 return false;
13236 /* Note that these are little-endian tests.
13237 We correct for big-endian later. */
13238 if (d->perm[0] == 0)
13239 odd = 0;
13240 else if (d->perm[0] == 1)
13241 odd = 1;
13242 else
13243 return false;
13244 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13246 for (i = 0; i < nelt; i += 2)
13248 if (d->perm[i] != i + odd)
13249 return false;
13250 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13251 return false;
13254 /* Success! */
13255 if (d->testing_p)
13256 return true;
13258 in0 = d->op0;
13259 in1 = d->op1;
13260 if (BYTES_BIG_ENDIAN)
13262 x = in0, in0 = in1, in1 = x;
13263 odd = !odd;
13265 out = d->target;
13267 if (odd)
13269 switch (vmode)
13271 case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13272 case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13273 case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13274 case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13275 case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13276 case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13277 case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13278 case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13279 case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13280 case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13281 case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13282 case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13283 default:
13284 return false;
13287 else
13289 switch (vmode)
13291 case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13292 case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13293 case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13294 case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13295 case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13296 case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13297 case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13298 case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13299 case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13300 case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13301 case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13302 case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13303 default:
13304 return false;
13308 emit_insn (gen (out, in0, in1));
13309 return true;
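/* Illustrative sketch: for V4SImode the two-vector permutation { 0, 4, 2, 6 }
   passes the test above with odd == 0 and is emitted as TRN1, while
   { 1, 5, 3, 7 } matches with odd == 1 and becomes TRN2 (on big-endian the
   operands and ODD are swapped first).  */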
13312 /* Recognize patterns suitable for the UZP instructions. */
13313 static bool
13314 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13316 unsigned int i, odd, mask, nelt = d->nelt;
13317 rtx out, in0, in1, x;
13318 rtx (*gen) (rtx, rtx, rtx);
13319 machine_mode vmode = d->vmode;
13321 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13322 return false;
13324 /* Note that these are little-endian tests.
13325 We correct for big-endian later. */
13326 if (d->perm[0] == 0)
13327 odd = 0;
13328 else if (d->perm[0] == 1)
13329 odd = 1;
13330 else
13331 return false;
13332 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13334 for (i = 0; i < nelt; i++)
13336 unsigned elt = (i * 2 + odd) & mask;
13337 if (d->perm[i] != elt)
13338 return false;
13341 /* Success! */
13342 if (d->testing_p)
13343 return true;
13345 in0 = d->op0;
13346 in1 = d->op1;
13347 if (BYTES_BIG_ENDIAN)
13349 x = in0, in0 = in1, in1 = x;
13350 odd = !odd;
13352 out = d->target;
13354 if (odd)
13356 switch (vmode)
13358 case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13359 case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13360 case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13361 case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13362 case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13363 case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13364 case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13365 case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13366 case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13367 case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13368 case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13369 case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13370 default:
13371 return false;
13374 else
13376 switch (vmode)
13378 case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13379 case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13380 case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13381 case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13382 case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13383 case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13384 case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13385 case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13386 case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13387 case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13388 case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13389 case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13390 default:
13391 return false;
13395 emit_insn (gen (out, in0, in1));
13396 return true;
13399 /* Recognize patterns suitable for the ZIP instructions. */
13400 static bool
13401 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13403 unsigned int i, high, mask, nelt = d->nelt;
13404 rtx out, in0, in1, x;
13405 rtx (*gen) (rtx, rtx, rtx);
13406 machine_mode vmode = d->vmode;
13408 if (GET_MODE_UNIT_SIZE (vmode) > 8)
13409 return false;
13411 /* Note that these are little-endian tests.
13412 We correct for big-endian later. */
13413 high = nelt / 2;
13414 if (d->perm[0] == high)
13415 /* Do Nothing. */
13417 else if (d->perm[0] == 0)
13418 high = 0;
13419 else
13420 return false;
13421 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13423 for (i = 0; i < nelt / 2; i++)
13425 unsigned elt = (i + high) & mask;
13426 if (d->perm[i * 2] != elt)
13427 return false;
13428 elt = (elt + nelt) & mask;
13429 if (d->perm[i * 2 + 1] != elt)
13430 return false;
13433 /* Success! */
13434 if (d->testing_p)
13435 return true;
13437 in0 = d->op0;
13438 in1 = d->op1;
13439 if (BYTES_BIG_ENDIAN)
13441 x = in0, in0 = in1, in1 = x;
13442 high = !high;
13444 out = d->target;
13446 if (high)
13448 switch (vmode)
13450 case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13451 case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13452 case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13453 case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13454 case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13455 case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13456 case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13457 case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13458 case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13459 case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13460 case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13461 case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13462 default:
13463 return false;
13466 else
13468 switch (vmode)
13470 case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13471 case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13472 case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13473 case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13474 case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13475 case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13476 case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13477 case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13478 case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13479 case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13480 case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13481 case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13482 default:
13483 return false;
13487 emit_insn (gen (out, in0, in1));
13488 return true;
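/* Illustrative sketch: for V4SImode the permutation { 0, 4, 1, 5 }
   interleaves the low halves of the two inputs and matches above with
   high == 0, giving ZIP1; { 2, 6, 3, 7 } interleaves the high halves and
   becomes ZIP2.  */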
13491 /* Recognize patterns for the EXT insn. */
13493 static bool
13494 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13496 unsigned int i, nelt = d->nelt;
13497 rtx (*gen) (rtx, rtx, rtx, rtx);
13498 rtx offset;
13500 unsigned int location = d->perm[0]; /* Always < nelt. */
13502 /* Check if the extracted indices are increasing by one. */
13503 for (i = 1; i < nelt; i++)
13505 unsigned int required = location + i;
13506 if (d->one_vector_p)
13508 /* We'll pass the same vector in twice, so allow indices to wrap. */
13509 required &= (nelt - 1);
13511 if (d->perm[i] != required)
13512 return false;
13515 switch (d->vmode)
13517 case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13518 case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13519 case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13520 case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13521 case E_V2SImode: gen = gen_aarch64_extv2si; break;
13522 case E_V4SImode: gen = gen_aarch64_extv4si; break;
13523 case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13524 case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13525 case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13526 case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13527 case E_V2DImode: gen = gen_aarch64_extv2di; break;
13528 case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13529 default:
13530 return false;
13533 /* Success! */
13534 if (d->testing_p)
13535 return true;
13537 /* The case where (location == 0) is a no-op for both big- and little-endian,
13538 and is removed by the mid-end at optimization levels -O1 and higher. */
13540 if (BYTES_BIG_ENDIAN && (location != 0))
13542 /* After setup, we want the high elements of the first vector (stored
13543 at the LSB end of the register), and the low elements of the second
13544 vector (stored at the MSB end of the register). So swap. */
13545 std::swap (d->op0, d->op1);
13546 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13547 location = nelt - location;
13550 offset = GEN_INT (location);
13551 emit_insn (gen (d->target, d->op0, d->op1, offset));
13552 return true;
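/* Illustrative sketch: for V4SImode the two-vector permutation { 3, 4, 5, 6 }
   has indices increasing by one from location 3, so it is matched above and
   emitted as an EXT with element offset 3 (which the insn pattern scales to
   a byte immediate); on big-endian the operands are swapped and the offset
   becomes nelt - location = 1.  */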
13555 /* Recognize patterns for the REV insns. */
13557 static bool
13558 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13560 unsigned int i, j, diff, nelt = d->nelt;
13561 rtx (*gen) (rtx, rtx);
13563 if (!d->one_vector_p)
13564 return false;
13566 diff = d->perm[0];
13567 switch (diff)
13569 case 7:
13570 switch (d->vmode)
13572 case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13573 case E_V8QImode: gen = gen_aarch64_rev64v8qi; break;
13574 default:
13575 return false;
13577 break;
13578 case 3:
13579 switch (d->vmode)
13581 case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13582 case E_V8QImode: gen = gen_aarch64_rev32v8qi; break;
13583 case E_V8HImode: gen = gen_aarch64_rev64v8hi; break;
13584 case E_V4HImode: gen = gen_aarch64_rev64v4hi; break;
13585 default:
13586 return false;
13588 break;
13589 case 1:
13590 switch (d->vmode)
13592 case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13593 case E_V8QImode: gen = gen_aarch64_rev16v8qi; break;
13594 case E_V8HImode: gen = gen_aarch64_rev32v8hi; break;
13595 case E_V4HImode: gen = gen_aarch64_rev32v4hi; break;
13596 case E_V4SImode: gen = gen_aarch64_rev64v4si; break;
13597 case E_V2SImode: gen = gen_aarch64_rev64v2si; break;
13598 case E_V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13599 case E_V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13600 case E_V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13601 case E_V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13602 default:
13603 return false;
13605 break;
13606 default:
13607 return false;
13610 for (i = 0; i < nelt ; i += diff + 1)
13611 for (j = 0; j <= diff; j += 1)
13613 /* This is guaranteed to be true as the value of diff
13614 is 7, 3 or 1, and we should have enough elements in the
13615 queue to generate this. Getting a vector mask with a
13616 value of diff other than these values implies that
13617 something is wrong by the time we get here. */
13618 gcc_assert (i + j < nelt);
13619 if (d->perm[i + j] != i + diff - j)
13620 return false;
13623 /* Success! */
13624 if (d->testing_p)
13625 return true;
13627 emit_insn (gen (d->target, d->op0));
13628 return true;
13631 static bool
13632 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13634 rtx (*gen) (rtx, rtx, rtx);
13635 rtx out = d->target;
13636 rtx in0;
13637 machine_mode vmode = d->vmode;
13638 unsigned int i, elt, nelt = d->nelt;
13639 rtx lane;
13641 elt = d->perm[0];
13642 for (i = 1; i < nelt; i++)
13644 if (elt != d->perm[i])
13645 return false;
13648 /* The generic preparation in aarch64_expand_vec_perm_const_1
13649 swaps the operand order and the permute indices if it finds
13650 d->perm[0] to be in the second operand. Thus, we can always
13651 use d->op0 and need not do any extra arithmetic to get the
13652 correct lane number. */
13653 in0 = d->op0;
13654 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13656 switch (vmode)
13658 case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13659 case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13660 case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13661 case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13662 case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13663 case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13664 case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13665 case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13666 case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13667 case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13668 case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13669 case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13670 default:
13671 return false;
13674 emit_insn (gen (out, in0, lane));
13675 return true;
13678 static bool
13679 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13681 rtx rperm[MAX_VECT_LEN], sel;
13682 machine_mode vmode = d->vmode;
13683 unsigned int i, nelt = d->nelt;
13685 if (d->testing_p)
13686 return true;
13688 /* Generic code will try constant permutation twice: once with the
13689 original mode and again with the elements lowered to QImode.
13690 So wait and don't do the selector expansion ourselves. */
13691 if (vmode != V8QImode && vmode != V16QImode)
13692 return false;
13694 for (i = 0; i < nelt; ++i)
13696 int nunits = GET_MODE_NUNITS (vmode);
13698 /* If big-endian and two vectors, we end up with a weird mixed-endian
13699 mode on NEON. Reverse the index within each word but not the word
13700 itself. */
13701 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13702 : d->perm[i]);
13704 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13705 sel = force_reg (vmode, sel);
13707 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13708 return true;
13711 static bool
13712 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13714 /* The pattern matching functions above are written to look for a small
13715 number to begin the sequence (0, 1, N/2). If we begin with an index
13716 from the second operand, we can swap the operands. */
13717 if (d->perm[0] >= d->nelt)
13719 unsigned i, nelt = d->nelt;
13721 gcc_assert (nelt == (nelt & -nelt));
13722 for (i = 0; i < nelt; ++i)
13723 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13725 std::swap (d->op0, d->op1);
13728 if (TARGET_SIMD)
13730 if (aarch64_evpc_rev (d))
13731 return true;
13732 else if (aarch64_evpc_ext (d))
13733 return true;
13734 else if (aarch64_evpc_dup (d))
13735 return true;
13736 else if (aarch64_evpc_zip (d))
13737 return true;
13738 else if (aarch64_evpc_uzp (d))
13739 return true;
13740 else if (aarch64_evpc_trn (d))
13741 return true;
13742 return aarch64_evpc_tbl (d);
13744 return false;
13747 /* Expand a vec_perm_const pattern. */
13749 bool
13750 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13752 struct expand_vec_perm_d d;
13753 int i, nelt, which;
13755 d.target = target;
13756 d.op0 = op0;
13757 d.op1 = op1;
13759 d.vmode = GET_MODE (target);
13760 gcc_assert (VECTOR_MODE_P (d.vmode));
13761 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13762 d.testing_p = false;
13764 for (i = which = 0; i < nelt; ++i)
13766 rtx e = XVECEXP (sel, 0, i);
13767 int ei = INTVAL (e) & (2 * nelt - 1);
13768 which |= (ei < nelt ? 1 : 2);
13769 d.perm[i] = ei;
13772 switch (which)
13774 default:
13775 gcc_unreachable ();
13777 case 3:
13778 d.one_vector_p = false;
13779 if (!rtx_equal_p (op0, op1))
13780 break;
13782 /* The elements of PERM do not suggest that only the first operand
13783 is used, but both operands are identical. Allow easier matching
13784 of the permutation by folding the permutation into the single
13785 input vector. */
13786 /* Fall Through. */
13787 case 2:
13788 for (i = 0; i < nelt; ++i)
13789 d.perm[i] &= nelt - 1;
13790 d.op0 = op1;
13791 d.one_vector_p = true;
13792 break;
13794 case 1:
13795 d.op1 = op0;
13796 d.one_vector_p = true;
13797 break;
13800 return aarch64_expand_vec_perm_const_1 (&d);
13803 static bool
13804 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13805 const unsigned char *sel)
13807 struct expand_vec_perm_d d;
13808 unsigned int i, nelt, which;
13809 bool ret;
13811 d.vmode = vmode;
13812 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13813 d.testing_p = true;
13814 memcpy (d.perm, sel, nelt);
13816 /* Calculate whether all elements are in one vector. */
13817 for (i = which = 0; i < nelt; ++i)
13819 unsigned char e = d.perm[i];
13820 gcc_assert (e < 2 * nelt);
13821 which |= (e < nelt ? 1 : 2);
13824 /* If all elements are from the second vector, reindex as if from the
13825 first vector. */
13826 if (which == 2)
13827 for (i = 0; i < nelt; ++i)
13828 d.perm[i] -= nelt;
13830 /* Check whether the mask can be applied to a single vector. */
13831 d.one_vector_p = (which != 3);
13833 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13834 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13835 if (!d.one_vector_p)
13836 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13838 start_sequence ();
13839 ret = aarch64_expand_vec_perm_const_1 (&d);
13840 end_sequence ();
13842 return ret;
13846 aarch64_reverse_mask (machine_mode mode)
13848 /* We have to reverse each vector because we don't have
13849 a permuted load that can reverse-load according to ABI rules. */
13850 rtx mask;
13851 rtvec v = rtvec_alloc (16);
13852 int i, j;
13853 int nunits = GET_MODE_NUNITS (mode);
13854 int usize = GET_MODE_UNIT_SIZE (mode);
13856 gcc_assert (BYTES_BIG_ENDIAN);
13857 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13859 for (i = 0; i < nunits; i++)
13860 for (j = 0; j < usize; j++)
13861 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13862 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13863 return force_reg (V16QImode, mask);
13866 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13867 However due to issues with register allocation it is preferable to avoid
13868 tying integer scalar and FP scalar modes. Executing integer operations
13869 in general registers is better than treating them as scalar vector
13870 operations. This reduces latency and avoids redundant int<->FP moves.
13871 So tie modes if they are either of the same class, or vector modes with
13872 other vector modes, vector structs or any scalar mode.
13875 bool
13876 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13878 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13879 return true;
13881 /* We specifically want to allow elements of "structure" modes to
13882 be tieable to the structure. This more general condition allows
13883 other rarer situations too. */
13884 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13885 return true;
13887 /* Also allow any scalar modes with vectors. */
13888 if (aarch64_vector_mode_supported_p (mode1)
13889 || aarch64_vector_mode_supported_p (mode2))
13890 return true;
13892 return false;
13895 /* Return a new RTX holding the result of moving POINTER forward by
13896 AMOUNT bytes. */
13898 static rtx
13899 aarch64_move_pointer (rtx pointer, int amount)
13901 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13903 return adjust_automodify_address (pointer, GET_MODE (pointer),
13904 next, amount);
13907 /* Return a new RTX holding the result of moving POINTER forward by the
13908 size of the mode it points to. */
13910 static rtx
13911 aarch64_progress_pointer (rtx pointer)
13913 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13915 return aarch64_move_pointer (pointer, amount);
13918 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13919 MODE bytes. */
13921 static void
13922 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13923 machine_mode mode)
13925 rtx reg = gen_reg_rtx (mode);
13927 /* "Cast" the pointers to the correct mode. */
13928 *src = adjust_address (*src, mode, 0);
13929 *dst = adjust_address (*dst, mode, 0);
13930 /* Emit the memcpy. */
13931 emit_move_insn (reg, *src);
13932 emit_move_insn (*dst, reg);
13933 /* Move the pointers forward. */
13934 *src = aarch64_progress_pointer (*src);
13935 *dst = aarch64_progress_pointer (*dst);
13938 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13939 we succeed, otherwise return false. */
13941 bool
13942 aarch64_expand_movmem (rtx *operands)
13944 unsigned int n;
13945 rtx dst = operands[0];
13946 rtx src = operands[1];
13947 rtx base;
13948 bool speed_p = !optimize_function_for_size_p (cfun);
13950 /* When optimizing for size, give a better estimate of the length of a
13951 memcpy call, but use the default otherwise. */
13952 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13954 /* We can't do anything smart if the amount to copy is not constant. */
13955 if (!CONST_INT_P (operands[2]))
13956 return false;
13958 n = UINTVAL (operands[2]);
13960 /* Try to keep the number of instructions low. For cases below 16 bytes we
13961 need to make at most two moves. For cases above 16 bytes it will be one
13962 move for each 16 byte chunk, then at most two additional moves. */
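/* For example, a 35-byte copy becomes two 16-byte (TImode) copies followed
   by one overlapping 4-byte (SImode) copy of bytes 31..34; a 7-byte copy
   becomes a 4-byte copy of bytes 0..3 followed by an overlapping 4-byte
   copy of bytes 3..6. */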
13963 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13964 return false;
13966 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13967 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13969 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13970 src = adjust_automodify_address (src, VOIDmode, base, 0);
13972 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13973 1-byte chunk. */
13974 if (n < 4)
13976 if (n >= 2)
13978 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13979 n -= 2;
13982 if (n == 1)
13983 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13985 return true;
13988 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13989 4-byte chunk, partially overlapping with the previously copied chunk. */
13990 if (n < 8)
13992 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13993 n -= 4;
13994 if (n > 0)
13996 int move = n - 4;
13998 src = aarch64_move_pointer (src, move);
13999 dst = aarch64_move_pointer (dst, move);
14000 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14002 return true;
14005 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14006 them, then (if applicable) an 8-byte chunk. */
14007 while (n >= 8)
14009 if (n / 16)
14011 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14012 n -= 16;
14014 else
14016 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14017 n -= 8;
14021 /* Finish the final bytes of the copy. We can always do this in one
14022 instruction. We either copy the exact amount we need, or partially
14023 overlap with the previous chunk we copied and copy 4 or 8 bytes. */
14024 if (n == 0)
14025 return true;
14026 else if (n == 1)
14027 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14028 else if (n == 2)
14029 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14030 else if (n == 4)
14031 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14032 else
14034 if (n == 3)
14036 src = aarch64_move_pointer (src, -1);
14037 dst = aarch64_move_pointer (dst, -1);
14038 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14040 else
14042 int move = n - 8;
14044 src = aarch64_move_pointer (src, move);
14045 dst = aarch64_move_pointer (dst, move);
14046 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14050 return true;
14053 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14054 SImode stores. Handle the case when the constant has identical
14055 bottom and top halves. This is beneficial when the two stores can be
14056 merged into an STP and we avoid synthesising potentially expensive
14057 immediates twice. Return true if such a split is possible. */
14059 bool
14060 aarch64_split_dimode_const_store (rtx dst, rtx src)
14062 rtx lo = gen_lowpart (SImode, src);
14063 rtx hi = gen_highpart_mode (SImode, DImode, src);
14065 bool size_p = optimize_function_for_size_p (cfun);
14067 if (!rtx_equal_p (lo, hi))
14068 return false;
14070 unsigned int orig_cost
14071 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14072 unsigned int lo_cost
14073 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14075 /* We want to transform:
14076 MOV x1, 49370
14077 MOVK x1, 0x140, lsl 16
14078 MOVK x1, 0xc0da, lsl 32
14079 MOVK x1, 0x140, lsl 48
14080 STR x1, [x0]
14081 into:
14082 MOV w1, 49370
14083 MOVK w1, 0x140, lsl 16
14084 STP w1, w1, [x0]
14085 So we want to perform this only when we save two instructions
14086 or more. When optimizing for size, however, accept any code size
14087 savings we can. */
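/* In the example above the DImode immediate needs four move instructions
   while the SImode immediate needs only two, so orig_cost (4) exceeds
   lo_cost + 1 (3) and the split is performed whether optimizing for size
   or for speed. */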
14088 if (size_p && orig_cost <= lo_cost)
14089 return false;
14091 if (!size_p
14092 && (orig_cost <= lo_cost + 1))
14093 return false;
14095 rtx mem_lo = adjust_address (dst, SImode, 0);
14096 if (!aarch64_mem_pair_operand (mem_lo, SImode))
14097 return false;
14099 rtx tmp_reg = gen_reg_rtx (SImode);
14100 aarch64_expand_mov_immediate (tmp_reg, lo);
14101 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14102 /* Don't emit an explicit store pair as this may not always be profitable.
14103 Let the sched-fusion logic decide whether to merge them. */
14104 emit_move_insn (mem_lo, tmp_reg);
14105 emit_move_insn (mem_hi, tmp_reg);
14107 return true;
14110 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14112 static unsigned HOST_WIDE_INT
14113 aarch64_asan_shadow_offset (void)
14115 return (HOST_WIDE_INT_1 << 36);
14118 static bool
14119 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14120 unsigned int align,
14121 enum by_pieces_operation op,
14122 bool speed_p)
14124 /* STORE_BY_PIECES can be used when copying a constant string, but
14125 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14126 For now we always fail this and let the move_by_pieces code copy
14127 the string from read-only memory. */
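/* Roughly speaking, storing one arbitrary 64-bit chunk directly needs up to
   MOV + 3 x MOVK to build the constant plus an STR (5 instructions), whereas
   copying the same chunk from its read-only image is just LDR + STR. */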
14128 if (op == STORE_BY_PIECES)
14129 return false;
14131 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14134 static rtx
14135 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14136 int code, tree treeop0, tree treeop1)
14138 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14139 rtx op0, op1;
14140 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14141 insn_code icode;
14142 struct expand_operand ops[4];
14144 start_sequence ();
14145 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14147 op_mode = GET_MODE (op0);
14148 if (op_mode == VOIDmode)
14149 op_mode = GET_MODE (op1);
14151 switch (op_mode)
14153 case E_QImode:
14154 case E_HImode:
14155 case E_SImode:
14156 cmp_mode = SImode;
14157 icode = CODE_FOR_cmpsi;
14158 break;
14160 case E_DImode:
14161 cmp_mode = DImode;
14162 icode = CODE_FOR_cmpdi;
14163 break;
14165 case E_SFmode:
14166 cmp_mode = SFmode;
14167 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14168 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14169 break;
14171 case E_DFmode:
14172 cmp_mode = DFmode;
14173 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14174 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14175 break;
14177 default:
14178 end_sequence ();
14179 return NULL_RTX;
14182 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14183 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14184 if (!op0 || !op1)
14186 end_sequence ();
14187 return NULL_RTX;
14189 *prep_seq = get_insns ();
14190 end_sequence ();
14192 create_fixed_operand (&ops[0], op0);
14193 create_fixed_operand (&ops[1], op1);
14195 start_sequence ();
14196 if (!maybe_expand_insn (icode, 2, ops))
14198 end_sequence ();
14199 return NULL_RTX;
14201 *gen_seq = get_insns ();
14202 end_sequence ();
14204 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14205 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14208 static rtx
14209 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14210 int cmp_code, tree treeop0, tree treeop1, int bit_code)
14212 rtx op0, op1, target;
14213 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14214 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14215 insn_code icode;
14216 struct expand_operand ops[6];
14217 int aarch64_cond;
14219 push_to_sequence (*prep_seq);
14220 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14222 op_mode = GET_MODE (op0);
14223 if (op_mode == VOIDmode)
14224 op_mode = GET_MODE (op1);
14226 switch (op_mode)
14228 case E_QImode:
14229 case E_HImode:
14230 case E_SImode:
14231 cmp_mode = SImode;
14232 icode = CODE_FOR_ccmpsi;
14233 break;
14235 case E_DImode:
14236 cmp_mode = DImode;
14237 icode = CODE_FOR_ccmpdi;
14238 break;
14240 case E_SFmode:
14241 cmp_mode = SFmode;
14242 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14243 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14244 break;
14246 case E_DFmode:
14247 cmp_mode = DFmode;
14248 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14249 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14250 break;
14252 default:
14253 end_sequence ();
14254 return NULL_RTX;
14257 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14258 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14259 if (!op0 || !op1)
14261 end_sequence ();
14262 return NULL_RTX;
14264 *prep_seq = get_insns ();
14265 end_sequence ();
14267 target = gen_rtx_REG (cc_mode, CC_REGNUM);
14268 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14270 if (bit_code != AND)
14272 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14273 GET_MODE (XEXP (prev, 0))),
14274 VOIDmode, XEXP (prev, 0), const0_rtx);
14275 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14278 create_fixed_operand (&ops[0], XEXP (prev, 0));
14279 create_fixed_operand (&ops[1], target);
14280 create_fixed_operand (&ops[2], op0);
14281 create_fixed_operand (&ops[3], op1);
14282 create_fixed_operand (&ops[4], prev);
14283 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14285 push_to_sequence (*gen_seq);
14286 if (!maybe_expand_insn (icode, 6, ops))
14288 end_sequence ();
14289 return NULL_RTX;
14292 *gen_seq = get_insns ();
14293 end_sequence ();
14295 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14298 #undef TARGET_GEN_CCMP_FIRST
14299 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14301 #undef TARGET_GEN_CCMP_NEXT
14302 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14304 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14305 instruction fusion of some sort. */
14307 static bool
14308 aarch64_macro_fusion_p (void)
14310 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14314 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14315 should be kept together during scheduling. */
14317 static bool
14318 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14320 rtx set_dest;
14321 rtx prev_set = single_set (prev);
14322 rtx curr_set = single_set (curr);
14323 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14324 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14326 if (!aarch64_macro_fusion_p ())
14327 return false;
14329 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14331 /* We are trying to match:
14332 prev (mov) == (set (reg r0) (const_int imm16))
14333 curr (movk) == (set (zero_extract (reg r0)
14334 (const_int 16)
14335 (const_int 16))
14336 (const_int imm16_1)) */
14338 set_dest = SET_DEST (curr_set);
14340 if (GET_CODE (set_dest) == ZERO_EXTRACT
14341 && CONST_INT_P (SET_SRC (curr_set))
14342 && CONST_INT_P (SET_SRC (prev_set))
14343 && CONST_INT_P (XEXP (set_dest, 2))
14344 && INTVAL (XEXP (set_dest, 2)) == 16
14345 && REG_P (XEXP (set_dest, 0))
14346 && REG_P (SET_DEST (prev_set))
14347 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14349 return true;
14353 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14356 /* We're trying to match:
14357 prev (adrp) == (set (reg r1)
14358 (high (symbol_ref ("SYM"))))
14359 curr (add) == (set (reg r0)
14360 (lo_sum (reg r1)
14361 (symbol_ref ("SYM"))))
14362 Note that r0 need not be the same as r1, especially
14363 during pre-regalloc scheduling. */
14365 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14366 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14368 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14369 && REG_P (XEXP (SET_SRC (curr_set), 0))
14370 && REGNO (XEXP (SET_SRC (curr_set), 0))
14371 == REGNO (SET_DEST (prev_set))
14372 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14373 XEXP (SET_SRC (curr_set), 1)))
14374 return true;
14378 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14381 /* We're trying to match:
14382 prev (movk) == (set (zero_extract (reg r0)
14383 (const_int 16)
14384 (const_int 32))
14385 (const_int imm16_1))
14386 curr (movk) == (set (zero_extract (reg r0)
14387 (const_int 16)
14388 (const_int 48))
14389 (const_int imm16_2)) */
14391 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14392 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14393 && REG_P (XEXP (SET_DEST (prev_set), 0))
14394 && REG_P (XEXP (SET_DEST (curr_set), 0))
14395 && REGNO (XEXP (SET_DEST (prev_set), 0))
14396 == REGNO (XEXP (SET_DEST (curr_set), 0))
14397 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14398 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14399 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14400 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14401 && CONST_INT_P (SET_SRC (prev_set))
14402 && CONST_INT_P (SET_SRC (curr_set)))
14403 return true;
14406 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14408 /* We're trying to match:
14409 prev (adrp) == (set (reg r0)
14410 (high (symbol_ref ("SYM"))))
14411 curr (ldr) == (set (reg r1)
14412 (mem (lo_sum (reg r0)
14413 (symbol_ref ("SYM")))))
14415 curr (ldr) == (set (reg r1)
14416 (zero_extend (mem
14417 (lo_sum (reg r0)
14418 (symbol_ref ("SYM")))))) */
14419 if (satisfies_constraint_Ush (SET_SRC (prev_set))
14420 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14422 rtx curr_src = SET_SRC (curr_set);
14424 if (GET_CODE (curr_src) == ZERO_EXTEND)
14425 curr_src = XEXP (curr_src, 0);
14427 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14428 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14429 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14430 == REGNO (SET_DEST (prev_set))
14431 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14432 XEXP (SET_SRC (prev_set), 0)))
14433 return true;
14437 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14438 && aarch_crypto_can_dual_issue (prev, curr))
14439 return true;
14441 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14442 && any_condjump_p (curr))
14444 enum attr_type prev_type = get_attr_type (prev);
14446 unsigned int condreg1, condreg2;
14447 rtx cc_reg_1;
14448 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14449 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14451 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14452 && prev
14453 && modified_in_p (cc_reg_1, prev))
14455 /* FIXME: this misses some instructions that ThunderX considers simple
14456 arithmetic; simple shifts are among those missed. */
14457 if (prev_type == TYPE_ALUS_SREG
14458 || prev_type == TYPE_ALUS_IMM
14459 || prev_type == TYPE_LOGICS_REG
14460 || prev_type == TYPE_LOGICS_IMM)
14461 return true;
14465 if (prev_set
14466 && curr_set
14467 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14468 && any_condjump_p (curr))
14470 /* We're trying to match:
14471 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14472 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14473 (const_int 0))
14474 (label_ref ("SYM"))
14475 (pc)) */
14476 if (SET_DEST (curr_set) == (pc_rtx)
14477 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14478 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14479 && REG_P (SET_DEST (prev_set))
14480 && REGNO (SET_DEST (prev_set))
14481 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14483 /* Fuse ALU operations followed by conditional branch instruction. */
14484 switch (get_attr_type (prev))
14486 case TYPE_ALU_IMM:
14487 case TYPE_ALU_SREG:
14488 case TYPE_ADC_REG:
14489 case TYPE_ADC_IMM:
14490 case TYPE_ADCS_REG:
14491 case TYPE_ADCS_IMM:
14492 case TYPE_LOGIC_REG:
14493 case TYPE_LOGIC_IMM:
14494 case TYPE_CSEL:
14495 case TYPE_ADR:
14496 case TYPE_MOV_IMM:
14497 case TYPE_SHIFT_REG:
14498 case TYPE_SHIFT_IMM:
14499 case TYPE_BFM:
14500 case TYPE_RBIT:
14501 case TYPE_REV:
14502 case TYPE_EXTEND:
14503 return true;
14505 default:;
14510 return false;
14513 /* Return true iff the instruction fusion described by OP is enabled. */
14515 bool
14516 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14518 return (aarch64_tune_params.fusible_ops & op) != 0;
14521 /* If MEM is in the form of [base+offset], extract the two parts of the
14522 address into BASE and OFFSET, otherwise return false after clearing
14523 BASE and OFFSET. */
14525 bool
14526 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14528 rtx addr;
14530 gcc_assert (MEM_P (mem));
14532 addr = XEXP (mem, 0);
14534 if (REG_P (addr))
14536 *base = addr;
14537 *offset = const0_rtx;
14538 return true;
14541 if (GET_CODE (addr) == PLUS
14542 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14544 *base = XEXP (addr, 0);
14545 *offset = XEXP (addr, 1);
14546 return true;
14549 *base = NULL_RTX;
14550 *offset = NULL_RTX;
14552 return false;
14555 /* Types for scheduling fusion. */
14556 enum sched_fusion_type
14558 SCHED_FUSION_NONE = 0,
14559 SCHED_FUSION_LD_SIGN_EXTEND,
14560 SCHED_FUSION_LD_ZERO_EXTEND,
14561 SCHED_FUSION_LD,
14562 SCHED_FUSION_ST,
14563 SCHED_FUSION_NUM
14566 /* If INSN is a load or store whose address is in the form of [base+offset],
14567 extract the two parts into BASE and OFFSET. Return the scheduling fusion
14568 type of INSN. */
14570 static enum sched_fusion_type
14571 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14573 rtx x, dest, src;
14574 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14576 gcc_assert (INSN_P (insn));
14577 x = PATTERN (insn);
14578 if (GET_CODE (x) != SET)
14579 return SCHED_FUSION_NONE;
14581 src = SET_SRC (x);
14582 dest = SET_DEST (x);
14584 machine_mode dest_mode = GET_MODE (dest);
14586 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14587 return SCHED_FUSION_NONE;
14589 if (GET_CODE (src) == SIGN_EXTEND)
14591 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14592 src = XEXP (src, 0);
14593 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14594 return SCHED_FUSION_NONE;
14596 else if (GET_CODE (src) == ZERO_EXTEND)
14598 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14599 src = XEXP (src, 0);
14600 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14601 return SCHED_FUSION_NONE;
14604 if (GET_CODE (src) == MEM && REG_P (dest))
14605 extract_base_offset_in_addr (src, base, offset);
14606 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14608 fusion = SCHED_FUSION_ST;
14609 extract_base_offset_in_addr (dest, base, offset);
14611 else
14612 return SCHED_FUSION_NONE;
14614 if (*base == NULL_RTX || *offset == NULL_RTX)
14615 fusion = SCHED_FUSION_NONE;
14617 return fusion;
14620 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14622 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14623 and PRI are only calculated for these instructions. For other instructions,
14624 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
14625 other instruction types can be added by returning different priorities.
14627 It's important that irrelevant instructions get the largest FUSION_PRI. */
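/* For instance, the loads "ldr w0, [x1, 4]" and "ldr w2, [x1, 8]" receive
   the same FUSION_PRI (same fusion type and base register), while the load
   with the smaller offset receives the larger PRI and is scheduled first,
   so the two can later be paired into an ldp. */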
14629 static void
14630 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14631 int *fusion_pri, int *pri)
14633 int tmp, off_val;
14634 rtx base, offset;
14635 enum sched_fusion_type fusion;
14637 gcc_assert (INSN_P (insn));
14639 tmp = max_pri - 1;
14640 fusion = fusion_load_store (insn, &base, &offset);
14641 if (fusion == SCHED_FUSION_NONE)
14643 *pri = tmp;
14644 *fusion_pri = tmp;
14645 return;
14648 /* Set FUSION_PRI according to fusion type and base register. */
14649 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14651 /* Calculate PRI. */
14652 tmp /= 2;
14654 /* INSN with smaller offset goes first. */
14655 off_val = (int)(INTVAL (offset));
14656 if (off_val >= 0)
14657 tmp -= (off_val & 0xfffff);
14658 else
14659 tmp += ((- off_val) & 0xfffff);
14661 *pri = tmp;
14662 return;
14665 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14666 Adjust priority of sha1h instructions so they are scheduled before
14667 other SHA1 instructions. */
14669 static int
14670 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14672 rtx x = PATTERN (insn);
14674 if (GET_CODE (x) == SET)
14676 x = SET_SRC (x);
14678 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14679 return priority + 10;
14682 return priority;
14685 /* Given OPERANDS of consecutive load/store, check if we can merge
14686 them into ldp/stp. LOAD is true if they are load instructions.
14687 MODE is the mode of memory operands. */
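/* For instance, "ldr w0, [x2]" followed by "ldr w1, [x2, 4]" would normally
   satisfy these checks (same base, consecutive SImode offsets, distinct
   destination registers of the same class, no address clobber) and could be
   merged into "ldp w0, w1, [x2]". */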
14689 bool
14690 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14691 machine_mode mode)
14693 HOST_WIDE_INT offval_1, offval_2, msize;
14694 enum reg_class rclass_1, rclass_2;
14695 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14697 if (load)
14699 mem_1 = operands[1];
14700 mem_2 = operands[3];
14701 reg_1 = operands[0];
14702 reg_2 = operands[2];
14703 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14704 if (REGNO (reg_1) == REGNO (reg_2))
14705 return false;
14707 else
14709 mem_1 = operands[0];
14710 mem_2 = operands[2];
14711 reg_1 = operands[1];
14712 reg_2 = operands[3];
14715 /* The mems cannot be volatile. */
14716 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14717 return false;
14719 /* If we have SImode and slow unaligned ldp,
14720 check that the alignment is at least 8 bytes. */
14721 if (mode == SImode
14722 && (aarch64_tune_params.extra_tuning_flags
14723 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14724 && !optimize_size
14725 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14726 return false;
14728 /* Check if the addresses are in the form of [base+offset]. */
14729 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14730 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14731 return false;
14732 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14733 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14734 return false;
14736 /* Check if the bases are same. */
14737 if (!rtx_equal_p (base_1, base_2))
14738 return false;
14740 offval_1 = INTVAL (offset_1);
14741 offval_2 = INTVAL (offset_2);
14742 msize = GET_MODE_SIZE (mode);
14743 /* Check if the offsets are consecutive. */
14744 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14745 return false;
14747 /* Check if the addresses are clobbered by load. */
14748 if (load)
14750 if (reg_mentioned_p (reg_1, mem_1))
14751 return false;
14753 /* In increasing order, the last load can clobber the address. */
14754 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14755 return false;
14758 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14759 rclass_1 = FP_REGS;
14760 else
14761 rclass_1 = GENERAL_REGS;
14763 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14764 rclass_2 = FP_REGS;
14765 else
14766 rclass_2 = GENERAL_REGS;
14768 /* Check if the registers are of same class. */
14769 if (rclass_1 != rclass_2)
14770 return false;
14772 return true;
14775 /* Given OPERANDS of consecutive load/store, check if we can merge
14776 them into ldp/stp by adjusting the offset. LOAD is true if they
14777 are load instructions. MODE is the mode of memory operands.
14779 Given the following consecutive stores:
14781 str w1, [xb, 0x100]
14782 str w1, [xb, 0x104]
14783 str w1, [xb, 0x108]
14784 str w1, [xb, 0x10c]
14786 Though the offsets are out of the range supported by stp, we can
14787 still pair them after adjusting the offset, like:
14789 add scratch, xb, 0x100
14790 stp w1, w1, [scratch]
14791 stp w1, w1, [scratch, 0x8]
14793 The peephole patterns detecting this opportunity should guarantee
14794 the scratch register is available. */
14796 bool
14797 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14798 machine_mode mode)
14800 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14801 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14802 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14803 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14805 if (load)
14807 reg_1 = operands[0];
14808 mem_1 = operands[1];
14809 reg_2 = operands[2];
14810 mem_2 = operands[3];
14811 reg_3 = operands[4];
14812 mem_3 = operands[5];
14813 reg_4 = operands[6];
14814 mem_4 = operands[7];
14815 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14816 && REG_P (reg_3) && REG_P (reg_4));
14817 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14818 return false;
14820 else
14822 mem_1 = operands[0];
14823 reg_1 = operands[1];
14824 mem_2 = operands[2];
14825 reg_2 = operands[3];
14826 mem_3 = operands[4];
14827 reg_3 = operands[5];
14828 mem_4 = operands[6];
14829 reg_4 = operands[7];
14831 /* Skip if the memory operand is by itself valid for ldp/stp. */
14832 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14833 return false;
14835 /* The mems cannot be volatile. */
14836 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14837 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14838 return false;
14840 /* Check if the addresses are in the form of [base+offset]. */
14841 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14842 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14843 return false;
14844 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14845 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14846 return false;
14847 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14848 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14849 return false;
14850 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14851 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14852 return false;
14854 /* Check if the bases are same. */
14855 if (!rtx_equal_p (base_1, base_2)
14856 || !rtx_equal_p (base_2, base_3)
14857 || !rtx_equal_p (base_3, base_4))
14858 return false;
14860 offval_1 = INTVAL (offset_1);
14861 offval_2 = INTVAL (offset_2);
14862 offval_3 = INTVAL (offset_3);
14863 offval_4 = INTVAL (offset_4);
14864 msize = GET_MODE_SIZE (mode);
14865 /* Check if the offsets are consecutive. */
14866 if ((offval_1 != (offval_2 + msize)
14867 || offval_1 != (offval_3 + msize * 2)
14868 || offval_1 != (offval_4 + msize * 3))
14869 && (offval_4 != (offval_3 + msize)
14870 || offval_4 != (offval_2 + msize * 2)
14871 || offval_4 != (offval_1 + msize * 3)))
14872 return false;
14874 /* Check if the addresses are clobbered by load. */
14875 if (load)
14877 if (reg_mentioned_p (reg_1, mem_1)
14878 || reg_mentioned_p (reg_2, mem_2)
14879 || reg_mentioned_p (reg_3, mem_3))
14880 return false;
14882 /* In increasing order, the last load can clobber the address. */
14883 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14884 return false;
14887 /* If we have SImode and slow unaligned ldp,
14888 check that the alignment is at least 8 bytes. */
14889 if (mode == SImode
14890 && (aarch64_tune_params.extra_tuning_flags
14891 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14892 && !optimize_size
14893 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14894 return false;
14896 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14897 rclass_1 = FP_REGS;
14898 else
14899 rclass_1 = GENERAL_REGS;
14901 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14902 rclass_2 = FP_REGS;
14903 else
14904 rclass_2 = GENERAL_REGS;
14906 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14907 rclass_3 = FP_REGS;
14908 else
14909 rclass_3 = GENERAL_REGS;
14911 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14912 rclass_4 = FP_REGS;
14913 else
14914 rclass_4 = GENERAL_REGS;
14916 /* Check if the registers are of same class. */
14917 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14918 return false;
14920 return true;
14923 /* Given OPERANDS of consecutive load/store, this function pairs them
14924 into ldp/stp after adjusting the offset. It depends on the fact
14925 that addresses of load/store instructions are in increasing order.
14926 MODE is the mode of memory operands. CODE is the rtl operator
14927 which should be applied to all memory operands; it is one of SIGN_EXTEND,
14928 ZERO_EXTEND or UNKNOWN. */
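/* Working through the str example given before
   aarch64_operands_adjust_ok_for_ldpstp with SImode operands: msize is 4,
   so the stp offset limit is 4 * 0x40 = 0x100; the first offset 0x100
   splits into adj_off = 0x100 and new_off = 0, giving the
   "add scratch, xb, 0x100" instruction and stp stores at [scratch] and
   [scratch, 8]. */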
14930 bool
14931 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14932 machine_mode mode, RTX_CODE code)
14934 rtx base, offset, t1, t2;
14935 rtx mem_1, mem_2, mem_3, mem_4;
14936 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14938 if (load)
14940 mem_1 = operands[1];
14941 mem_2 = operands[3];
14942 mem_3 = operands[5];
14943 mem_4 = operands[7];
14945 else
14947 mem_1 = operands[0];
14948 mem_2 = operands[2];
14949 mem_3 = operands[4];
14950 mem_4 = operands[6];
14951 gcc_assert (code == UNKNOWN);
14954 extract_base_offset_in_addr (mem_1, &base, &offset);
14955 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14957 /* Adjust the offset so that it fits in an ldp/stp instruction. */
14958 msize = GET_MODE_SIZE (mode);
14959 stp_off_limit = msize * 0x40;
14960 off_val = INTVAL (offset);
14961 abs_off = (off_val < 0) ? -off_val : off_val;
14962 new_off = abs_off % stp_off_limit;
14963 adj_off = abs_off - new_off;
14965 /* Further adjust to make sure all offsets are OK. */
14966 if ((new_off + msize * 2) >= stp_off_limit)
14968 adj_off += stp_off_limit;
14969 new_off -= stp_off_limit;
14972 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14973 if (adj_off >= 0x1000)
14974 return false;
14976 if (off_val < 0)
14978 adj_off = -adj_off;
14979 new_off = -new_off;
14982 /* Create new memory references. */
14983 mem_1 = change_address (mem_1, VOIDmode,
14984 plus_constant (DImode, operands[8], new_off));
14986 /* Check if the adjusted address is OK for ldp/stp. */
14987 if (!aarch64_mem_pair_operand (mem_1, mode))
14988 return false;
14990 msize = GET_MODE_SIZE (mode);
14991 mem_2 = change_address (mem_2, VOIDmode,
14992 plus_constant (DImode,
14993 operands[8],
14994 new_off + msize));
14995 mem_3 = change_address (mem_3, VOIDmode,
14996 plus_constant (DImode,
14997 operands[8],
14998 new_off + msize * 2));
14999 mem_4 = change_address (mem_4, VOIDmode,
15000 plus_constant (DImode,
15001 operands[8],
15002 new_off + msize * 3));
15004 if (code == ZERO_EXTEND)
15006 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15007 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15008 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15009 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15011 else if (code == SIGN_EXTEND)
15013 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15014 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15015 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15016 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15019 if (load)
15021 operands[1] = mem_1;
15022 operands[3] = mem_2;
15023 operands[5] = mem_3;
15024 operands[7] = mem_4;
15026 else
15028 operands[0] = mem_1;
15029 operands[2] = mem_2;
15030 operands[4] = mem_3;
15031 operands[6] = mem_4;
15034 /* Emit adjusting instruction. */
15035 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15036 /* Emit ldp/stp instructions. */
15037 t1 = gen_rtx_SET (operands[0], operands[1]);
15038 t2 = gen_rtx_SET (operands[2], operands[3]);
15039 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15040 t1 = gen_rtx_SET (operands[4], operands[5]);
15041 t2 = gen_rtx_SET (operands[6], operands[7]);
15042 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15043 return true;
15046 /* Return true if a pseudo register should be created and used to hold
15047 the GOT address for PIC code. */
15049 bool
15050 aarch64_use_pseudo_pic_reg (void)
15052 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15055 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15057 static int
15058 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15060 switch (XINT (x, 1))
15062 case UNSPEC_GOTSMALLPIC:
15063 case UNSPEC_GOTSMALLPIC28K:
15064 case UNSPEC_GOTTINYPIC:
15065 return 0;
15066 default:
15067 break;
15070 return default_unspec_may_trap_p (x, flags);
15074 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15075 return the log2 of that value. Otherwise return -1. */
15078 aarch64_fpconst_pow_of_2 (rtx x)
15080 const REAL_VALUE_TYPE *r;
15082 if (!CONST_DOUBLE_P (x))
15083 return -1;
15085 r = CONST_DOUBLE_REAL_VALUE (x);
15087 if (REAL_VALUE_NEGATIVE (*r)
15088 || REAL_VALUE_ISNAN (*r)
15089 || REAL_VALUE_ISINF (*r)
15090 || !real_isinteger (r, DFmode))
15091 return -1;
15093 return exact_log2 (real_to_integer (r));
15096 /* If X is a vector of equal CONST_DOUBLE values and that value is
15097 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15100 aarch64_vec_fpconst_pow_of_2 (rtx x)
15102 if (GET_CODE (x) != CONST_VECTOR)
15103 return -1;
15105 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15106 return -1;
15108 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15109 if (firstval <= 0)
15110 return -1;
15112 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15113 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15114 return -1;
15116 return firstval;
15119 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15120 to float.
15122 __fp16 always promotes through this hook.
15123 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15124 through the generic excess precision logic rather than here. */
15126 static tree
15127 aarch64_promoted_type (const_tree t)
15129 if (SCALAR_FLOAT_TYPE_P (t)
15130 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15131 return float_type_node;
15133 return NULL_TREE;
15136 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15138 static bool
15139 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15140 optimization_type opt_type)
15142 switch (op)
15144 case rsqrt_optab:
15145 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15147 default:
15148 return true;
15152 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15153 if MODE is HFmode, and punt to the generic implementation otherwise. */
15155 static bool
15156 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15158 return (mode == HFmode
15159 ? true
15160 : default_libgcc_floating_mode_supported_p (mode));
15163 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15164 if MODE is HFmode, and punt to the generic implementation otherwise. */
15166 static bool
15167 aarch64_scalar_mode_supported_p (scalar_mode mode)
15169 return (mode == HFmode
15170 ? true
15171 : default_scalar_mode_supported_p (mode));
15174 /* Set the value of FLT_EVAL_METHOD.
15175 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15177 0: evaluate all operations and constants, whose semantic type has at
15178 most the range and precision of type float, to the range and
15179 precision of float; evaluate all other operations and constants to
15180 the range and precision of the semantic type;
15182 N, where _FloatN is a supported interchange floating type:
15183 evaluate all operations and constants, whose semantic type has at
15184 most the range and precision of _FloatN type, to the range and
15185 precision of the _FloatN type; evaluate all other operations and
15186 constants to the range and precision of the semantic type;
15188 If we have the ARMv8.2-A extensions then we support _Float16 in native
15189 precision, so we should set this to 16. Otherwise, we support the type,
15190 but want to evaluate expressions in float precision, so set this to
15191 0. */
15193 static enum flt_eval_method
15194 aarch64_excess_precision (enum excess_precision_type type)
15196 switch (type)
15198 case EXCESS_PRECISION_TYPE_FAST:
15199 case EXCESS_PRECISION_TYPE_STANDARD:
15200 /* We can calculate either in 16-bit range and precision or
15201 32-bit range and precision. Make that decision based on whether
15202 we have native support for the ARMv8.2-A 16-bit floating-point
15203 instructions or not. */
15204 return (TARGET_FP_F16INST
15205 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15206 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15207 case EXCESS_PRECISION_TYPE_IMPLICIT:
15208 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15209 default:
15210 gcc_unreachable ();
15212 return FLT_EVAL_METHOD_UNPREDICTABLE;
15215 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15216 scheduled for speculative execution. Reject the long-running division
15217 and square-root instructions. */
15219 static bool
15220 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15222 switch (get_attr_type (insn))
15224 case TYPE_SDIV:
15225 case TYPE_UDIV:
15226 case TYPE_FDIVS:
15227 case TYPE_FDIVD:
15228 case TYPE_FSQRTS:
15229 case TYPE_FSQRTD:
15230 case TYPE_NEON_FP_SQRT_S:
15231 case TYPE_NEON_FP_SQRT_D:
15232 case TYPE_NEON_FP_SQRT_S_Q:
15233 case TYPE_NEON_FP_SQRT_D_Q:
15234 case TYPE_NEON_FP_DIV_S:
15235 case TYPE_NEON_FP_DIV_D:
15236 case TYPE_NEON_FP_DIV_S_Q:
15237 case TYPE_NEON_FP_DIV_D_Q:
15238 return false;
15239 default:
15240 return true;
15244 /* Target-specific selftests. */
15246 #if CHECKING_P
15248 namespace selftest {
15250 /* Selftest for the RTL loader.
15251 Verify that the RTL loader copes with a dump from
15252 print_rtx_function. This is essentially just a test that class
15253 function_reader can handle a real dump, but it also verifies
15254 that lookup_reg_by_dump_name correctly handles hard regs.
15255 The presence of hard reg names in the dump means that the test is
15256 target-specific, hence it is in this file. */
15258 static void
15259 aarch64_test_loading_full_dump ()
15261 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15263 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15265 rtx_insn *insn_1 = get_insn_by_uid (1);
15266 ASSERT_EQ (NOTE, GET_CODE (insn_1));
15268 rtx_insn *insn_15 = get_insn_by_uid (15);
15269 ASSERT_EQ (INSN, GET_CODE (insn_15));
15270 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15272 /* Verify crtl->return_rtx. */
15273 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15274 ASSERT_EQ (0, REGNO (crtl->return_rtx));
15275 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15278 /* Run all target-specific selftests. */
15280 static void
15281 aarch64_run_selftests (void)
15283 aarch64_test_loading_full_dump ();
15286 } // namespace selftest
15288 #endif /* #if CHECKING_P */
15290 #undef TARGET_ADDRESS_COST
15291 #define TARGET_ADDRESS_COST aarch64_address_cost
15293 /* This hook determines whether unnamed bitfields affect the alignment
15294 of the containing structure. The hook returns true if the structure
15295 should inherit the alignment requirements of an unnamed bitfield's
15296 type. */
15297 #undef TARGET_ALIGN_ANON_BITFIELD
15298 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15300 #undef TARGET_ASM_ALIGNED_DI_OP
15301 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15303 #undef TARGET_ASM_ALIGNED_HI_OP
15304 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15306 #undef TARGET_ASM_ALIGNED_SI_OP
15307 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15309 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15310 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15311 hook_bool_const_tree_hwi_hwi_const_tree_true
15313 #undef TARGET_ASM_FILE_START
15314 #define TARGET_ASM_FILE_START aarch64_start_file
15316 #undef TARGET_ASM_OUTPUT_MI_THUNK
15317 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15319 #undef TARGET_ASM_SELECT_RTX_SECTION
15320 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15322 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15323 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15325 #undef TARGET_BUILD_BUILTIN_VA_LIST
15326 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15328 #undef TARGET_CALLEE_COPIES
15329 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15331 #undef TARGET_CAN_ELIMINATE
15332 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15334 #undef TARGET_CAN_INLINE_P
15335 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15337 #undef TARGET_CANNOT_FORCE_CONST_MEM
15338 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15340 #undef TARGET_CASE_VALUES_THRESHOLD
15341 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15343 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15344 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15346 /* Only the least significant bit is used for initialization guard
15347 variables. */
15348 #undef TARGET_CXX_GUARD_MASK_BIT
15349 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15351 #undef TARGET_C_MODE_FOR_SUFFIX
15352 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15354 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15355 #undef TARGET_DEFAULT_TARGET_FLAGS
15356 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15357 #endif
15359 #undef TARGET_CLASS_MAX_NREGS
15360 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15362 #undef TARGET_BUILTIN_DECL
15363 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15365 #undef TARGET_BUILTIN_RECIPROCAL
15366 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15368 #undef TARGET_C_EXCESS_PRECISION
15369 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15371 #undef TARGET_EXPAND_BUILTIN
15372 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15374 #undef TARGET_EXPAND_BUILTIN_VA_START
15375 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15377 #undef TARGET_FOLD_BUILTIN
15378 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15380 #undef TARGET_FUNCTION_ARG
15381 #define TARGET_FUNCTION_ARG aarch64_function_arg
15383 #undef TARGET_FUNCTION_ARG_ADVANCE
15384 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15386 #undef TARGET_FUNCTION_ARG_BOUNDARY
15387 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15389 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15390 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15392 #undef TARGET_FUNCTION_VALUE
15393 #define TARGET_FUNCTION_VALUE aarch64_function_value
15395 #undef TARGET_FUNCTION_VALUE_REGNO_P
15396 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15398 #undef TARGET_FRAME_POINTER_REQUIRED
15399 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15401 #undef TARGET_GIMPLE_FOLD_BUILTIN
15402 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15404 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15405 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15407 #undef TARGET_INIT_BUILTINS
15408 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15410 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15411 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15412 aarch64_ira_change_pseudo_allocno_class
15414 #undef TARGET_LEGITIMATE_ADDRESS_P
15415 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15417 #undef TARGET_LEGITIMATE_CONSTANT_P
15418 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15420 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15421 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15422 aarch64_legitimize_address_displacement
15424 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15425 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15427 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15428 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15429 aarch64_libgcc_floating_mode_supported_p
15431 #undef TARGET_MANGLE_TYPE
15432 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15434 #undef TARGET_MEMORY_MOVE_COST
15435 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15437 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15438 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15440 #undef TARGET_MUST_PASS_IN_STACK
15441 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15443 /* This target hook should return true if accesses to volatile bitfields
15444 should use the narrowest mode possible. It should return false if these
15445 accesses should use the bitfield container type. */
15446 #undef TARGET_NARROW_VOLATILE_BITFIELD
15447 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15449 #undef TARGET_OPTION_OVERRIDE
15450 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15452 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15453 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15454 aarch64_override_options_after_change
15456 #undef TARGET_OPTION_SAVE
15457 #define TARGET_OPTION_SAVE aarch64_option_save
15459 #undef TARGET_OPTION_RESTORE
15460 #define TARGET_OPTION_RESTORE aarch64_option_restore
15462 #undef TARGET_OPTION_PRINT
15463 #define TARGET_OPTION_PRINT aarch64_option_print
15465 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15466 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15468 #undef TARGET_SET_CURRENT_FUNCTION
15469 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15471 #undef TARGET_PASS_BY_REFERENCE
15472 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15474 #undef TARGET_PREFERRED_RELOAD_CLASS
15475 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15477 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15478 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15480 #undef TARGET_PROMOTED_TYPE
15481 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15483 #undef TARGET_SECONDARY_RELOAD
15484 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15486 #undef TARGET_SHIFT_TRUNCATION_MASK
15487 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15489 #undef TARGET_SETUP_INCOMING_VARARGS
15490 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15492 #undef TARGET_STRUCT_VALUE_RTX
15493 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15495 #undef TARGET_REGISTER_MOVE_COST
15496 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15498 #undef TARGET_RETURN_IN_MEMORY
15499 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15501 #undef TARGET_RETURN_IN_MSB
15502 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15504 #undef TARGET_RTX_COSTS
15505 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15507 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15508 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15510 #undef TARGET_SCHED_ISSUE_RATE
15511 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15513 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15514 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15515 aarch64_sched_first_cycle_multipass_dfa_lookahead
15517 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15518 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15519 aarch64_first_cycle_multipass_dfa_lookahead_guard
15521 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15522 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15523 aarch64_get_separate_components
15525 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15526 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15527 aarch64_components_for_bb
15529 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15530 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15531 aarch64_disqualify_components
15533 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15534 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15535 aarch64_emit_prologue_components
15537 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15538 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15539 aarch64_emit_epilogue_components
15541 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15542 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15543 aarch64_set_handled_components
15545 #undef TARGET_TRAMPOLINE_INIT
15546 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15548 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15549 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15551 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15552 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15554 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15555 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15556 aarch64_builtin_support_vector_misalignment
15558 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15559 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15561 #undef TARGET_VECTORIZE_ADD_STMT_COST
15562 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15564 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15565 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15566 aarch64_builtin_vectorization_cost
15568 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15569 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15571 #undef TARGET_VECTORIZE_BUILTINS
15572 #define TARGET_VECTORIZE_BUILTINS
15574 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15575 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15576 aarch64_builtin_vectorized_function
15578 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15579 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15580 aarch64_autovectorize_vector_sizes
15582 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15583 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15584 aarch64_atomic_assign_expand_fenv
15586 /* Section anchor support. */
15588 #undef TARGET_MIN_ANCHOR_OFFSET
15589 #define TARGET_MIN_ANCHOR_OFFSET -256
15591 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15592 byte offset; we can do much more for larger data types, but have no way
15593 to determine the size of the access. We assume accesses are aligned. */
15594 #undef TARGET_MAX_ANCHOR_OFFSET
15595 #define TARGET_MAX_ANCHOR_OFFSET 4095
15597 #undef TARGET_VECTOR_ALIGNMENT
15598 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15600 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15601 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15602 aarch64_simd_vector_alignment_reachable
15604 /* vec_perm support. */
15606 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15607 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15608 aarch64_vectorize_vec_perm_const_ok
15610 #undef TARGET_INIT_LIBFUNCS
15611 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15613 #undef TARGET_FIXED_CONDITION_CODE_REGS
15614 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15616 #undef TARGET_FLAGS_REGNUM
15617 #define TARGET_FLAGS_REGNUM CC_REGNUM
15619 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15620 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15622 #undef TARGET_ASAN_SHADOW_OFFSET
15623 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15625 #undef TARGET_LEGITIMIZE_ADDRESS
15626 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15628 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15629 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15630 aarch64_use_by_pieces_infrastructure_p
15632 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15633 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15635 #undef TARGET_CAN_USE_DOLOOP_P
15636 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15638 #undef TARGET_SCHED_ADJUST_PRIORITY
15639 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15641 #undef TARGET_SCHED_MACRO_FUSION_P
15642 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15644 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15645 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15647 #undef TARGET_SCHED_FUSION_PRIORITY
15648 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15650 #undef TARGET_UNSPEC_MAY_TRAP_P
15651 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15653 #undef TARGET_USE_PSEUDO_PIC_REG
15654 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15656 #undef TARGET_PRINT_OPERAND
15657 #define TARGET_PRINT_OPERAND aarch64_print_operand
15659 #undef TARGET_PRINT_OPERAND_ADDRESS
15660 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15662 #undef TARGET_OPTAB_SUPPORTED_P
15663 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15665 #undef TARGET_OMIT_STRUCT_RETURN_REG
15666 #define TARGET_OMIT_STRUCT_RETURN_REG true
15668 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15669 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15670 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15672 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15673 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15674 aarch64_hard_regno_call_part_clobbered
15676 #if CHECKING_P
15677 #undef TARGET_RUN_TARGET_SELFTESTS
15678 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15679 #endif /* #if CHECKING_P */
15681 struct gcc_target targetm = TARGET_INITIALIZER;
15683 #include "gt-aarch64.h"