gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 145                                                  const unsigned char *sel);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement HARD_REGNO_NREGS.  */
1071
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1087
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return true;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return
1110           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111       else
1112         return true;
1113     }
1114
1115   return false;
1116 }
1117
1118 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1119    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1120    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1121
1122 static bool
1123 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1124 {
1125   return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1126 }
1127
1128 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1129 machine_mode
1130 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1131                                      machine_mode mode)
1132 {
1133   /* Handle modes that fit within single registers.  */
1134   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1135     {
1136       if (GET_MODE_SIZE (mode) >= 4)
1137         return mode;
1138       else
1139         return SImode;
1140     }
1141   /* Fall back to generic for multi-reg and very large modes.  */
1142   else
1143     return choose_hard_reg_mode (regno, nregs, false);
1144 }
1145
1146 /* Return true if calls to DECL should be treated as
1147    long-calls (ie called via a register).  */
1148 static bool
1149 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1150 {
1151   return false;
1152 }
1153
1154 /* Return true if calls to symbol-ref SYM should be treated as
1155    long-calls (ie called via a register).  */
1156 bool
1157 aarch64_is_long_call_p (rtx sym)
1158 {
1159   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1160 }
1161
1162 /* Return true if calls to symbol-ref SYM should not go through
1163    plt stubs.  */
1164
1165 bool
1166 aarch64_is_noplt_call_p (rtx sym)
1167 {
1168   const_tree decl = SYMBOL_REF_DECL (sym);
1169
1170   if (flag_pic
1171       && decl
1172       && (!flag_plt
1173           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1174       && !targetm.binds_local_p (decl))
1175     return true;
1176
1177   return false;
1178 }
1179
1180 /* Return true if the offsets to a zero/sign-extract operation
1181    represent an expression that matches an extend operation.  The
1182    operands represent the paramters from
1183
1184    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1185 bool
1186 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1187                                 rtx extract_imm)
1188 {
1189   HOST_WIDE_INT mult_val, extract_val;
1190
1191   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1192     return false;
1193
1194   mult_val = INTVAL (mult_imm);
1195   extract_val = INTVAL (extract_imm);
1196
1197   if (extract_val > 8
1198       && extract_val < GET_MODE_BITSIZE (mode)
1199       && exact_log2 (extract_val & ~7) > 0
1200       && (extract_val & 7) <= 4
1201       && mult_val == (1 << (extract_val & 7)))
1202     return true;
1203
1204   return false;
1205 }
1206
1207 /* Emit an insn that's a simple single-set.  Both the operands must be
1208    known to be valid.  */
1209 inline static rtx_insn *
1210 emit_set_insn (rtx x, rtx y)
1211 {
1212   return emit_insn (gen_rtx_SET (x, y));
1213 }
1214
1215 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1216    return the rtx for register 0 in the proper mode.  */
1217 rtx
1218 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1219 {
1220   machine_mode mode = SELECT_CC_MODE (code, x, y);
1221   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1222
1223   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1224   return cc_reg;
1225 }
1226
1227 /* Build the SYMBOL_REF for __tls_get_addr.  */
1228
1229 static GTY(()) rtx tls_get_addr_libfunc;
1230
1231 rtx
1232 aarch64_tls_get_addr (void)
1233 {
1234   if (!tls_get_addr_libfunc)
1235     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1236   return tls_get_addr_libfunc;
1237 }
1238
1239 /* Return the TLS model to use for ADDR.  */
1240
1241 static enum tls_model
1242 tls_symbolic_operand_type (rtx addr)
1243 {
1244   enum tls_model tls_kind = TLS_MODEL_NONE;
1245   rtx sym, addend;
1246
1247   if (GET_CODE (addr) == CONST)
1248     {
1249       split_const (addr, &sym, &addend);
1250       if (GET_CODE (sym) == SYMBOL_REF)
1251         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1252     }
1253   else if (GET_CODE (addr) == SYMBOL_REF)
1254     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1255
1256   return tls_kind;
1257 }
1258
1259 /* We'll allow lo_sum's in addresses in our legitimate addresses
1260    so that combine would take care of combining addresses where
1261    necessary, but for generation purposes, we'll generate the address
1262    as :
1263    RTL                               Absolute
1264    tmp = hi (symbol_ref);            adrp  x1, foo
1265    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1266                                      nop
1267
1268    PIC                               TLS
1269    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1270    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1271                                      bl   __tls_get_addr
1272                                      nop
1273
1274    Load TLS symbol, depending on TLS mechanism and TLS access model.
1275
1276    Global Dynamic - Traditional TLS:
1277    adrp tmp, :tlsgd:imm
1278    add  dest, tmp, #:tlsgd_lo12:imm
1279    bl   __tls_get_addr
1280
1281    Global Dynamic - TLS Descriptors:
1282    adrp dest, :tlsdesc:imm
1283    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1284    add  dest, dest, #:tlsdesc_lo12:imm
1285    blr  tmp
1286    mrs  tp, tpidr_el0
1287    add  dest, dest, tp
1288
1289    Initial Exec:
1290    mrs  tp, tpidr_el0
1291    adrp tmp, :gottprel:imm
1292    ldr  dest, [tmp, #:gottprel_lo12:imm]
1293    add  dest, dest, tp
1294
1295    Local Exec:
1296    mrs  tp, tpidr_el0
1297    add  t0, tp, #:tprel_hi12:imm, lsl #12
1298    add  t0, t0, #:tprel_lo12_nc:imm
1299 */
1300
1301 static void
1302 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1303                                    enum aarch64_symbol_type type)
1304 {
1305   switch (type)
1306     {
1307     case SYMBOL_SMALL_ABSOLUTE:
1308       {
1309         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1310         rtx tmp_reg = dest;
1311         machine_mode mode = GET_MODE (dest);
1312
1313         gcc_assert (mode == Pmode || mode == ptr_mode);
1314
1315         if (can_create_pseudo_p ())
1316           tmp_reg = gen_reg_rtx (mode);
1317
1318         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1319         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1320         return;
1321       }
1322
1323     case SYMBOL_TINY_ABSOLUTE:
1324       emit_insn (gen_rtx_SET (dest, imm));
1325       return;
1326
1327     case SYMBOL_SMALL_GOT_28K:
1328       {
1329         machine_mode mode = GET_MODE (dest);
1330         rtx gp_rtx = pic_offset_table_rtx;
1331         rtx insn;
1332         rtx mem;
1333
1334         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1335            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1336            decide rtx costs, in which case pic_offset_table_rtx is not
1337            initialized.  For that case no need to generate the first adrp
1338            instruction as the final cost for global variable access is
1339            one instruction.  */
1340         if (gp_rtx != NULL)
1341           {
1342             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1343                using the page base as GOT base, the first page may be wasted,
1344                in the worst scenario, there is only 28K space for GOT).
1345
1346                The generate instruction sequence for accessing global variable
1347                is:
1348
1349                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1350
1351                Only one instruction needed. But we must initialize
1352                pic_offset_table_rtx properly.  We generate initialize insn for
1353                every global access, and allow CSE to remove all redundant.
1354
1355                The final instruction sequences will look like the following
1356                for multiply global variables access.
1357
1358                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1359
1360                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1361                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1362                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1363                  ...  */
1364
1365             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1366             crtl->uses_pic_offset_table = 1;
1367             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1368
1369             if (mode != GET_MODE (gp_rtx))
1370              gp_rtx = gen_lowpart (mode, gp_rtx);
1371
1372           }
1373
1374         if (mode == ptr_mode)
1375           {
1376             if (mode == DImode)
1377               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1378             else
1379               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1380
1381             mem = XVECEXP (SET_SRC (insn), 0, 0);
1382           }
1383         else
1384           {
1385             gcc_assert (mode == Pmode);
1386
1387             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1388             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1389           }
1390
1391         /* The operand is expected to be MEM.  Whenever the related insn
1392            pattern changed, above code which calculate mem should be
1393            updated.  */
1394         gcc_assert (GET_CODE (mem) == MEM);
1395         MEM_READONLY_P (mem) = 1;
1396         MEM_NOTRAP_P (mem) = 1;
1397         emit_insn (insn);
1398         return;
1399       }
1400
1401     case SYMBOL_SMALL_GOT_4G:
1402       {
1403         /* In ILP32, the mode of dest can be either SImode or DImode,
1404            while the got entry is always of SImode size.  The mode of
1405            dest depends on how dest is used: if dest is assigned to a
1406            pointer (e.g. in the memory), it has SImode; it may have
1407            DImode if dest is dereferenced to access the memeory.
1408            This is why we have to handle three different ldr_got_small
1409            patterns here (two patterns for ILP32).  */
1410
1411         rtx insn;
1412         rtx mem;
1413         rtx tmp_reg = dest;
1414         machine_mode mode = GET_MODE (dest);
1415
1416         if (can_create_pseudo_p ())
1417           tmp_reg = gen_reg_rtx (mode);
1418
1419         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1420         if (mode == ptr_mode)
1421           {
1422             if (mode == DImode)
1423               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1424             else
1425               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1426
1427             mem = XVECEXP (SET_SRC (insn), 0, 0);
1428           }
1429         else
1430           {
1431             gcc_assert (mode == Pmode);
1432
1433             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1434             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1435           }
1436
1437         gcc_assert (GET_CODE (mem) == MEM);
1438         MEM_READONLY_P (mem) = 1;
1439         MEM_NOTRAP_P (mem) = 1;
1440         emit_insn (insn);
1441         return;
1442       }
1443
1444     case SYMBOL_SMALL_TLSGD:
1445       {
1446         rtx_insn *insns;
1447         machine_mode mode = GET_MODE (dest);
1448         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1449
1450         start_sequence ();
1451         if (TARGET_ILP32)
1452           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1453         else
1454           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1455         insns = get_insns ();
1456         end_sequence ();
1457
1458         RTL_CONST_CALL_P (insns) = 1;
1459         emit_libcall_block (insns, dest, result, imm);
1460         return;
1461       }
1462
1463     case SYMBOL_SMALL_TLSDESC:
1464       {
1465         machine_mode mode = GET_MODE (dest);
1466         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1467         rtx tp;
1468
1469         gcc_assert (mode == Pmode || mode == ptr_mode);
1470
1471         /* In ILP32, the got entry is always of SImode size.  Unlike
1472            small GOT, the dest is fixed at reg 0.  */
1473         if (TARGET_ILP32)
1474           emit_insn (gen_tlsdesc_small_si (imm));
1475         else
1476           emit_insn (gen_tlsdesc_small_di (imm));
1477         tp = aarch64_load_tp (NULL);
1478
1479         if (mode != Pmode)
1480           tp = gen_lowpart (mode, tp);
1481
1482         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1483         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1484         return;
1485       }
1486
1487     case SYMBOL_SMALL_TLSIE:
1488       {
1489         /* In ILP32, the mode of dest can be either SImode or DImode,
1490            while the got entry is always of SImode size.  The mode of
1491            dest depends on how dest is used: if dest is assigned to a
1492            pointer (e.g. in the memory), it has SImode; it may have
1493            DImode if dest is dereferenced to access the memeory.
1494            This is why we have to handle three different tlsie_small
1495            patterns here (two patterns for ILP32).  */
1496         machine_mode mode = GET_MODE (dest);
1497         rtx tmp_reg = gen_reg_rtx (mode);
1498         rtx tp = aarch64_load_tp (NULL);
1499
1500         if (mode == ptr_mode)
1501           {
1502             if (mode == DImode)
1503               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1504             else
1505               {
1506                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1507                 tp = gen_lowpart (mode, tp);
1508               }
1509           }
1510         else
1511           {
1512             gcc_assert (mode == Pmode);
1513             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1514           }
1515
1516         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1517         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518         return;
1519       }
1520
1521     case SYMBOL_TLSLE12:
1522     case SYMBOL_TLSLE24:
1523     case SYMBOL_TLSLE32:
1524     case SYMBOL_TLSLE48:
1525       {
1526         machine_mode mode = GET_MODE (dest);
1527         rtx tp = aarch64_load_tp (NULL);
1528
1529         if (mode != Pmode)
1530           tp = gen_lowpart (mode, tp);
1531
1532         switch (type)
1533           {
1534           case SYMBOL_TLSLE12:
1535             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1536                         (dest, tp, imm));
1537             break;
1538           case SYMBOL_TLSLE24:
1539             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1540                         (dest, tp, imm));
1541           break;
1542           case SYMBOL_TLSLE32:
1543             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1544                         (dest, imm));
1545             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1546                         (dest, dest, tp));
1547           break;
1548           case SYMBOL_TLSLE48:
1549             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1550                         (dest, imm));
1551             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1552                         (dest, dest, tp));
1553             break;
1554           default:
1555             gcc_unreachable ();
1556           }
1557
1558         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559         return;
1560       }
1561
1562     case SYMBOL_TINY_GOT:
1563       emit_insn (gen_ldr_got_tiny (dest, imm));
1564       return;
1565
1566     case SYMBOL_TINY_TLSIE:
1567       {
1568         machine_mode mode = GET_MODE (dest);
1569         rtx tp = aarch64_load_tp (NULL);
1570
1571         if (mode == ptr_mode)
1572           {
1573             if (mode == DImode)
1574               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1575             else
1576               {
1577                 tp = gen_lowpart (mode, tp);
1578                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1579               }
1580           }
1581         else
1582           {
1583             gcc_assert (mode == Pmode);
1584             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1585           }
1586
1587         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1588         return;
1589       }
1590
1591     default:
1592       gcc_unreachable ();
1593     }
1594 }
1595
1596 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1597    handle all moves if !can_create_pseudo_p ().  The distinction is
1598    important because, unlike emit_move_insn, the move expanders know
1599    how to force Pmode objects into the constant pool even when the
1600    constant pool address is not itself legitimate.  */
1601 static rtx
1602 aarch64_emit_move (rtx dest, rtx src)
1603 {
1604   return (can_create_pseudo_p ()
1605           ? emit_move_insn (dest, src)
1606           : emit_move_insn_1 (dest, src));
1607 }
1608
1609 /* Split a 128-bit move operation into two 64-bit move operations,
1610    taking care to handle partial overlap of register to register
1611    copies.  Special cases are needed when moving between GP regs and
1612    FP regs.  SRC can be a register, constant or memory; DST a register
1613    or memory.  If either operand is memory it must not have any side
1614    effects.  */
1615 void
1616 aarch64_split_128bit_move (rtx dst, rtx src)
1617 {
1618   rtx dst_lo, dst_hi;
1619   rtx src_lo, src_hi;
1620
1621   machine_mode mode = GET_MODE (dst);
1622
1623   gcc_assert (mode == TImode || mode == TFmode);
1624   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1625   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1626
1627   if (REG_P (dst) && REG_P (src))
1628     {
1629       int src_regno = REGNO (src);
1630       int dst_regno = REGNO (dst);
1631
1632       /* Handle FP <-> GP regs.  */
1633       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1634         {
1635           src_lo = gen_lowpart (word_mode, src);
1636           src_hi = gen_highpart (word_mode, src);
1637
1638           if (mode == TImode)
1639             {
1640               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1641               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1642             }
1643           else
1644             {
1645               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1646               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1647             }
1648           return;
1649         }
1650       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1651         {
1652           dst_lo = gen_lowpart (word_mode, dst);
1653           dst_hi = gen_highpart (word_mode, dst);
1654
1655           if (mode == TImode)
1656             {
1657               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1658               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1659             }
1660           else
1661             {
1662               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1663               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1664             }
1665           return;
1666         }
1667     }
1668
1669   dst_lo = gen_lowpart (word_mode, dst);
1670   dst_hi = gen_highpart (word_mode, dst);
1671   src_lo = gen_lowpart (word_mode, src);
1672   src_hi = gen_highpart_mode (word_mode, mode, src);
1673
1674   /* At most one pairing may overlap.  */
1675   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1676     {
1677       aarch64_emit_move (dst_hi, src_hi);
1678       aarch64_emit_move (dst_lo, src_lo);
1679     }
1680   else
1681     {
1682       aarch64_emit_move (dst_lo, src_lo);
1683       aarch64_emit_move (dst_hi, src_hi);
1684     }
1685 }
1686
1687 bool
1688 aarch64_split_128bit_move_p (rtx dst, rtx src)
1689 {
1690   return (! REG_P (src)
1691           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1692 }
1693
1694 /* Split a complex SIMD combine.  */
1695
1696 void
1697 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1698 {
1699   machine_mode src_mode = GET_MODE (src1);
1700   machine_mode dst_mode = GET_MODE (dst);
1701
1702   gcc_assert (VECTOR_MODE_P (dst_mode));
1703   gcc_assert (register_operand (dst, dst_mode)
1704               && register_operand (src1, src_mode)
1705               && register_operand (src2, src_mode));
1706
1707   rtx (*gen) (rtx, rtx, rtx);
1708
1709   switch (src_mode)
1710     {
1711     case E_V8QImode:
1712       gen = gen_aarch64_simd_combinev8qi;
1713       break;
1714     case E_V4HImode:
1715       gen = gen_aarch64_simd_combinev4hi;
1716       break;
1717     case E_V2SImode:
1718       gen = gen_aarch64_simd_combinev2si;
1719       break;
1720     case E_V4HFmode:
1721       gen = gen_aarch64_simd_combinev4hf;
1722       break;
1723     case E_V2SFmode:
1724       gen = gen_aarch64_simd_combinev2sf;
1725       break;
1726     case E_DImode:
1727       gen = gen_aarch64_simd_combinedi;
1728       break;
1729     case E_DFmode:
1730       gen = gen_aarch64_simd_combinedf;
1731       break;
1732     default:
1733       gcc_unreachable ();
1734     }
1735
1736   emit_insn (gen (dst, src1, src2));
1737   return;
1738 }
1739
1740 /* Split a complex SIMD move.  */
1741
1742 void
1743 aarch64_split_simd_move (rtx dst, rtx src)
1744 {
1745   machine_mode src_mode = GET_MODE (src);
1746   machine_mode dst_mode = GET_MODE (dst);
1747
1748   gcc_assert (VECTOR_MODE_P (dst_mode));
1749
1750   if (REG_P (dst) && REG_P (src))
1751     {
1752       rtx (*gen) (rtx, rtx);
1753
1754       gcc_assert (VECTOR_MODE_P (src_mode));
1755
1756       switch (src_mode)
1757         {
1758         case E_V16QImode:
1759           gen = gen_aarch64_split_simd_movv16qi;
1760           break;
1761         case E_V8HImode:
1762           gen = gen_aarch64_split_simd_movv8hi;
1763           break;
1764         case E_V4SImode:
1765           gen = gen_aarch64_split_simd_movv4si;
1766           break;
1767         case E_V2DImode:
1768           gen = gen_aarch64_split_simd_movv2di;
1769           break;
1770         case E_V8HFmode:
1771           gen = gen_aarch64_split_simd_movv8hf;
1772           break;
1773         case E_V4SFmode:
1774           gen = gen_aarch64_split_simd_movv4sf;
1775           break;
1776         case E_V2DFmode:
1777           gen = gen_aarch64_split_simd_movv2df;
1778           break;
1779         default:
1780           gcc_unreachable ();
1781         }
1782
1783       emit_insn (gen (dst, src));
1784       return;
1785     }
1786 }
1787
1788 bool
1789 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1790                               machine_mode ymode, rtx y)
1791 {
1792   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1793   gcc_assert (r != NULL);
1794   return rtx_equal_p (x, r);
1795 }
1796
1797
1798 static rtx
1799 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1800 {
1801   if (can_create_pseudo_p ())
1802     return force_reg (mode, value);
1803   else
1804     {
1805       x = aarch64_emit_move (x, value);
1806       return x;
1807     }
1808 }
1809
1810
1811 static rtx
1812 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1813 {
1814   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1815     {
1816       rtx high;
1817       /* Load the full offset into a register.  This
1818          might be improvable in the future.  */
1819       high = GEN_INT (offset);
1820       offset = 0;
1821       high = aarch64_force_temporary (mode, temp, high);
1822       reg = aarch64_force_temporary (mode, temp,
1823                                      gen_rtx_PLUS (mode, high, reg));
1824     }
1825   return plus_constant (mode, reg, offset);
1826 }
1827
1828 static int
1829 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1830                                 machine_mode mode)
1831 {
1832   int i;
1833   unsigned HOST_WIDE_INT val, val2, mask;
1834   int one_match, zero_match;
1835   int num_insns;
1836
1837   val = INTVAL (imm);
1838
1839   if (aarch64_move_imm (val, mode))
1840     {
1841       if (generate)
1842         emit_insn (gen_rtx_SET (dest, imm));
1843       return 1;
1844     }
1845
1846   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1847      (with XXXX non-zero). In that case check to see if the move can be done in
1848      a smaller mode.  */
1849   val2 = val & 0xffffffff;
1850   if (mode == DImode
1851       && aarch64_move_imm (val2, SImode)
1852       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1853     {
1854       if (generate)
1855         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1856
1857       /* Check if we have to emit a second instruction by checking to see
1858          if any of the upper 32 bits of the original DI mode value is set.  */
1859       if (val == val2)
1860         return 1;
1861
1862       i = (val >> 48) ? 48 : 32;
1863
1864       if (generate)
1865          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1866                                     GEN_INT ((val >> i) & 0xffff)));
1867
1868       return 2;
1869     }
1870
1871   if ((val >> 32) == 0 || mode == SImode)
1872     {
1873       if (generate)
1874         {
1875           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1876           if (mode == SImode)
1877             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1878                                        GEN_INT ((val >> 16) & 0xffff)));
1879           else
1880             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1881                                        GEN_INT ((val >> 16) & 0xffff)));
1882         }
1883       return 2;
1884     }
1885
1886   /* Remaining cases are all for DImode.  */
1887
1888   mask = 0xffff;
1889   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1890     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1891   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1892     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1893
1894   if (zero_match != 2 && one_match != 2)
1895     {
1896       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1897          For a 64-bit bitmask try whether changing 16 bits to all ones or
1898          zeroes creates a valid bitmask.  To check any repeated bitmask,
1899          try using 16 bits from the other 32-bit half of val.  */
1900
1901       for (i = 0; i < 64; i += 16, mask <<= 16)
1902         {
1903           val2 = val & ~mask;
1904           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1905             break;
1906           val2 = val | mask;
1907           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1908             break;
1909           val2 = val2 & ~mask;
1910           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1911           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1912             break;
1913         }
1914       if (i != 64)
1915         {
1916           if (generate)
1917             {
1918               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1919               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1920                                          GEN_INT ((val >> i) & 0xffff)));
1921             }
1922           return 2;
1923         }
1924     }
1925
1926   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1927      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1928      otherwise skip zero bits.  */
1929
1930   num_insns = 1;
1931   mask = 0xffff;
1932   val2 = one_match > zero_match ? ~val : val;
1933   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1934
1935   if (generate)
1936     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1937                                            ? (val | ~(mask << i))
1938                                            : (val & (mask << i)))));
1939   for (i += 16; i < 64; i += 16)
1940     {
1941       if ((val2 & (mask << i)) == 0)
1942         continue;
1943       if (generate)
1944         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1945                                    GEN_INT ((val >> i) & 0xffff)));
1946       num_insns ++;
1947     }
1948
1949   return num_insns;
1950 }
1951
1952
1953 void
1954 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1955 {
1956   machine_mode mode = GET_MODE (dest);
1957
1958   gcc_assert (mode == SImode || mode == DImode);
1959
1960   /* Check on what type of symbol it is.  */
1961   if (GET_CODE (imm) == SYMBOL_REF
1962       || GET_CODE (imm) == LABEL_REF
1963       || GET_CODE (imm) == CONST)
1964     {
1965       rtx mem, base, offset;
1966       enum aarch64_symbol_type sty;
1967
1968       /* If we have (const (plus symbol offset)), separate out the offset
1969          before we start classifying the symbol.  */
1970       split_const (imm, &base, &offset);
1971
1972       sty = aarch64_classify_symbol (base, offset);
1973       switch (sty)
1974         {
1975         case SYMBOL_FORCE_TO_MEM:
1976           if (offset != const0_rtx
1977               && targetm.cannot_force_const_mem (mode, imm))
1978             {
1979               gcc_assert (can_create_pseudo_p ());
1980               base = aarch64_force_temporary (mode, dest, base);
1981               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1982               aarch64_emit_move (dest, base);
1983               return;
1984             }
1985
1986           mem = force_const_mem (ptr_mode, imm);
1987           gcc_assert (mem);
1988
1989           /* If we aren't generating PC relative literals, then
1990              we need to expand the literal pool access carefully.
1991              This is something that needs to be done in a number
1992              of places, so could well live as a separate function.  */
1993           if (!aarch64_pcrelative_literal_loads)
1994             {
1995               gcc_assert (can_create_pseudo_p ());
1996               base = gen_reg_rtx (ptr_mode);
1997               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1998               if (ptr_mode != Pmode)
1999                 base = convert_memory_address (Pmode, base);
2000               mem = gen_rtx_MEM (ptr_mode, base);
2001             }
2002
2003           if (mode != ptr_mode)
2004             mem = gen_rtx_ZERO_EXTEND (mode, mem);
2005
2006           emit_insn (gen_rtx_SET (dest, mem));
2007
2008           return;
2009
2010         case SYMBOL_SMALL_TLSGD:
2011         case SYMBOL_SMALL_TLSDESC:
2012         case SYMBOL_SMALL_TLSIE:
2013         case SYMBOL_SMALL_GOT_28K:
2014         case SYMBOL_SMALL_GOT_4G:
2015         case SYMBOL_TINY_GOT:
2016         case SYMBOL_TINY_TLSIE:
2017           if (offset != const0_rtx)
2018             {
2019               gcc_assert(can_create_pseudo_p ());
2020               base = aarch64_force_temporary (mode, dest, base);
2021               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2022               aarch64_emit_move (dest, base);
2023               return;
2024             }
2025           /* FALLTHRU */
2026
2027         case SYMBOL_SMALL_ABSOLUTE:
2028         case SYMBOL_TINY_ABSOLUTE:
2029         case SYMBOL_TLSLE12:
2030         case SYMBOL_TLSLE24:
2031         case SYMBOL_TLSLE32:
2032         case SYMBOL_TLSLE48:
2033           aarch64_load_symref_appropriately (dest, imm, sty);
2034           return;
2035
2036         default:
2037           gcc_unreachable ();
2038         }
2039     }
2040
2041   if (!CONST_INT_P (imm))
2042     {
2043       if (GET_CODE (imm) == HIGH)
2044         emit_insn (gen_rtx_SET (dest, imm));
2045       else
2046         {
2047           rtx mem = force_const_mem (mode, imm);
2048           gcc_assert (mem);
2049           emit_insn (gen_rtx_SET (dest, mem));
2050         }
2051
2052       return;
2053     }
2054
2055   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2056 }
2057
2058 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2059    temporary value if necessary.  FRAME_RELATED_P should be true if
2060    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2061    to the generated instructions.  If SCRATCHREG is known to hold
2062    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2063    immediate again.
2064
2065    Since this function may be used to adjust the stack pointer, we must
2066    ensure that it cannot cause transient stack deallocation (for example
2067    by first incrementing SP and then decrementing when adjusting by a
2068    large immediate).  */
2069
2070 static void
2071 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2072                                HOST_WIDE_INT delta, bool frame_related_p,
2073                                bool emit_move_imm)
2074 {
2075   HOST_WIDE_INT mdelta = abs_hwi (delta);
2076   rtx this_rtx = gen_rtx_REG (mode, regnum);
2077   rtx_insn *insn;
2078
2079   if (!mdelta)
2080     return;
2081
2082   /* Single instruction adjustment.  */
2083   if (aarch64_uimm12_shift (mdelta))
2084     {
2085       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2086       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2087       return;
2088     }
2089
2090   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2091      Only do this if mdelta is not a 16-bit move as adjusting using a move
2092      is better.  */
2093   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2094     {
2095       HOST_WIDE_INT low_off = mdelta & 0xfff;
2096
2097       low_off = delta < 0 ? -low_off : low_off;
2098       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2099       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2100       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2101       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2102       return;
2103     }
2104
2105   /* Emit a move immediate if required and an addition/subtraction.  */
2106   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2107   if (emit_move_imm)
2108     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2109   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2110                               : gen_add2_insn (this_rtx, scratch_rtx));
2111   if (frame_related_p)
2112     {
2113       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2114       rtx adj = plus_constant (mode, this_rtx, delta);
2115       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2116     }
2117 }
2118
2119 static inline void
2120 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2121                       HOST_WIDE_INT delta)
2122 {
2123   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2124 }
2125
2126 static inline void
2127 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2128 {
2129   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2130                                  true, emit_move_imm);
2131 }
2132
2133 static inline void
2134 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2135 {
2136   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2137                                  frame_related_p, true);
2138 }
2139
2140 static bool
2141 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2142                                  tree exp ATTRIBUTE_UNUSED)
2143 {
2144   /* Currently, always true.  */
2145   return true;
2146 }
2147
2148 /* Implement TARGET_PASS_BY_REFERENCE.  */
2149
2150 static bool
2151 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2152                            machine_mode mode,
2153                            const_tree type,
2154                            bool named ATTRIBUTE_UNUSED)
2155 {
2156   HOST_WIDE_INT size;
2157   machine_mode dummymode;
2158   int nregs;
2159
2160   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2161   size = (mode == BLKmode && type)
2162     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2163
2164   /* Aggregates are passed by reference based on their size.  */
2165   if (type && AGGREGATE_TYPE_P (type))
2166     {
2167       size = int_size_in_bytes (type);
2168     }
2169
2170   /* Variable sized arguments are always returned by reference.  */
2171   if (size < 0)
2172     return true;
2173
2174   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2175   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2176                                                &dummymode, &nregs,
2177                                                NULL))
2178     return false;
2179
2180   /* Arguments which are variable sized or larger than 2 registers are
2181      passed by reference unless they are a homogenous floating point
2182      aggregate.  */
2183   return size > 2 * UNITS_PER_WORD;
2184 }
2185
2186 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2187 static bool
2188 aarch64_return_in_msb (const_tree valtype)
2189 {
2190   machine_mode dummy_mode;
2191   int dummy_int;
2192
2193   /* Never happens in little-endian mode.  */
2194   if (!BYTES_BIG_ENDIAN)
2195     return false;
2196
2197   /* Only composite types smaller than or equal to 16 bytes can
2198      be potentially returned in registers.  */
2199   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2200       || int_size_in_bytes (valtype) <= 0
2201       || int_size_in_bytes (valtype) > 16)
2202     return false;
2203
2204   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2205      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2206      is always passed/returned in the least significant bits of fp/simd
2207      register(s).  */
2208   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2209                                                &dummy_mode, &dummy_int, NULL))
2210     return false;
2211
2212   return true;
2213 }
2214
2215 /* Implement TARGET_FUNCTION_VALUE.
2216    Define how to find the value returned by a function.  */
2217
2218 static rtx
2219 aarch64_function_value (const_tree type, const_tree func,
2220                         bool outgoing ATTRIBUTE_UNUSED)
2221 {
2222   machine_mode mode;
2223   int unsignedp;
2224   int count;
2225   machine_mode ag_mode;
2226
2227   mode = TYPE_MODE (type);
2228   if (INTEGRAL_TYPE_P (type))
2229     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2230
2231   if (aarch64_return_in_msb (type))
2232     {
2233       HOST_WIDE_INT size = int_size_in_bytes (type);
2234
2235       if (size % UNITS_PER_WORD != 0)
2236         {
2237           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2238           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2239         }
2240     }
2241
2242   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2243                                                &ag_mode, &count, NULL))
2244     {
2245       if (!aarch64_composite_type_p (type, mode))
2246         {
2247           gcc_assert (count == 1 && mode == ag_mode);
2248           return gen_rtx_REG (mode, V0_REGNUM);
2249         }
2250       else
2251         {
2252           int i;
2253           rtx par;
2254
2255           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2256           for (i = 0; i < count; i++)
2257             {
2258               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2259               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2260                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2261               XVECEXP (par, 0, i) = tmp;
2262             }
2263           return par;
2264         }
2265     }
2266   else
2267     return gen_rtx_REG (mode, R0_REGNUM);
2268 }
2269
2270 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2271    Return true if REGNO is the number of a hard register in which the values
2272    of called function may come back.  */
2273
2274 static bool
2275 aarch64_function_value_regno_p (const unsigned int regno)
2276 {
2277   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2278      of 16-byte return values are: 128-bit integers and 16-byte small
2279      structures (excluding homogeneous floating-point aggregates).  */
2280   if (regno == R0_REGNUM || regno == R1_REGNUM)
2281     return true;
2282
2283   /* Up to four fp/simd registers can return a function value, e.g. a
2284      homogeneous floating-point aggregate having four members.  */
2285   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2286     return TARGET_FLOAT;
2287
2288   return false;
2289 }
2290
2291 /* Implement TARGET_RETURN_IN_MEMORY.
2292
2293    If the type T of the result of a function is such that
2294      void func (T arg)
2295    would require that arg be passed as a value in a register (or set of
2296    registers) according to the parameter passing rules, then the result
2297    is returned in the same registers as would be used for such an
2298    argument.  */
2299
2300 static bool
2301 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2302 {
2303   HOST_WIDE_INT size;
2304   machine_mode ag_mode;
2305   int count;
2306
2307   if (!AGGREGATE_TYPE_P (type)
2308       && TREE_CODE (type) != COMPLEX_TYPE
2309       && TREE_CODE (type) != VECTOR_TYPE)
2310     /* Simple scalar types always returned in registers.  */
2311     return false;
2312
2313   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2314                                                type,
2315                                                &ag_mode,
2316                                                &count,
2317                                                NULL))
2318     return false;
2319
2320   /* Types larger than 2 registers returned in memory.  */
2321   size = int_size_in_bytes (type);
2322   return (size < 0 || size > 2 * UNITS_PER_WORD);
2323 }
2324
2325 static bool
2326 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2327                                const_tree type, int *nregs)
2328 {
2329   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2330   return aarch64_vfp_is_call_or_return_candidate (mode,
2331                                                   type,
2332                                                   &pcum->aapcs_vfp_rmode,
2333                                                   nregs,
2334                                                   NULL);
2335 }
2336
2337 /* Given MODE and TYPE of a function argument, return the alignment in
2338    bits.  The idea is to suppress any stronger alignment requested by
2339    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2340    This is a helper function for local use only.  */
2341
2342 static unsigned int
2343 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2344 {
2345   if (!type)
2346     return GET_MODE_ALIGNMENT (mode);
2347
2348   if (integer_zerop (TYPE_SIZE (type)))
2349     return 0;
2350
2351   gcc_assert (TYPE_MODE (type) == mode);
2352
2353   if (!AGGREGATE_TYPE_P (type))
2354     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2355
2356   if (TREE_CODE (type) == ARRAY_TYPE)
2357     return TYPE_ALIGN (TREE_TYPE (type));
2358
2359   unsigned int alignment = 0;
2360   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2361     if (TREE_CODE (field) == FIELD_DECL)
2362       alignment = std::max (alignment, DECL_ALIGN (field));
2363
2364   return alignment;
2365 }
2366
2367 /* Layout a function argument according to the AAPCS64 rules.  The rule
2368    numbers refer to the rule numbers in the AAPCS64.  */
2369
2370 static void
2371 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2372                     const_tree type,
2373                     bool named ATTRIBUTE_UNUSED)
2374 {
2375   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2376   int ncrn, nvrn, nregs;
2377   bool allocate_ncrn, allocate_nvrn;
2378   HOST_WIDE_INT size;
2379
2380   /* We need to do this once per argument.  */
2381   if (pcum->aapcs_arg_processed)
2382     return;
2383
2384   pcum->aapcs_arg_processed = true;
2385
2386   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2387   size
2388     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2389                 UNITS_PER_WORD);
2390
2391   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2392   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2393                                                  mode,
2394                                                  type,
2395                                                  &nregs);
2396
2397   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2398      The following code thus handles passing by SIMD/FP registers first.  */
2399
2400   nvrn = pcum->aapcs_nvrn;
2401
2402   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2403      and homogenous short-vector aggregates (HVA).  */
2404   if (allocate_nvrn)
2405     {
2406       if (!TARGET_FLOAT)
2407         aarch64_err_no_fpadvsimd (mode, "argument");
2408
2409       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2410         {
2411           pcum->aapcs_nextnvrn = nvrn + nregs;
2412           if (!aarch64_composite_type_p (type, mode))
2413             {
2414               gcc_assert (nregs == 1);
2415               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2416             }
2417           else
2418             {
2419               rtx par;
2420               int i;
2421               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2422               for (i = 0; i < nregs; i++)
2423                 {
2424                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2425                                          V0_REGNUM + nvrn + i);
2426                   tmp = gen_rtx_EXPR_LIST
2427                     (VOIDmode, tmp,
2428                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2429                   XVECEXP (par, 0, i) = tmp;
2430                 }
2431               pcum->aapcs_reg = par;
2432             }
2433           return;
2434         }
2435       else
2436         {
2437           /* C.3 NSRN is set to 8.  */
2438           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2439           goto on_stack;
2440         }
2441     }
2442
2443   ncrn = pcum->aapcs_ncrn;
2444   nregs = size / UNITS_PER_WORD;
2445
2446   /* C6 - C9.  though the sign and zero extension semantics are
2447      handled elsewhere.  This is the case where the argument fits
2448      entirely general registers.  */
2449   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2450     {
2451
2452       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2453
2454       /* C.8 if the argument has an alignment of 16 then the NGRN is
2455          rounded up to the next even number.  */
2456       if (nregs == 2
2457           && ncrn % 2
2458           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2459              comparison is there because for > 16 * BITS_PER_UNIT
2460              alignment nregs should be > 2 and therefore it should be
2461              passed by reference rather than value.  */
2462           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2463         {
2464           ++ncrn;
2465           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2466         }
2467
2468       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2469          A reg is still generated for it, but the caller should be smart
2470          enough not to use it.  */
2471       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2472         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2473       else
2474         {
2475           rtx par;
2476           int i;
2477
2478           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2479           for (i = 0; i < nregs; i++)
2480             {
2481               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2482               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2483                                        GEN_INT (i * UNITS_PER_WORD));
2484               XVECEXP (par, 0, i) = tmp;
2485             }
2486           pcum->aapcs_reg = par;
2487         }
2488
2489       pcum->aapcs_nextncrn = ncrn + nregs;
2490       return;
2491     }
2492
2493   /* C.11  */
2494   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2495
2496   /* The argument is passed on stack; record the needed number of words for
2497      this argument and align the total size if necessary.  */
2498 on_stack:
2499   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2500
2501   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2502     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2503                                        16 / UNITS_PER_WORD);
2504   return;
2505 }
2506
2507 /* Implement TARGET_FUNCTION_ARG.  */
2508
2509 static rtx
2510 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2511                       const_tree type, bool named)
2512 {
2513   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2514   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2515
2516   if (mode == VOIDmode)
2517     return NULL_RTX;
2518
2519   aarch64_layout_arg (pcum_v, mode, type, named);
2520   return pcum->aapcs_reg;
2521 }
2522
2523 void
2524 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2525                            const_tree fntype ATTRIBUTE_UNUSED,
2526                            rtx libname ATTRIBUTE_UNUSED,
2527                            const_tree fndecl ATTRIBUTE_UNUSED,
2528                            unsigned n_named ATTRIBUTE_UNUSED)
2529 {
2530   pcum->aapcs_ncrn = 0;
2531   pcum->aapcs_nvrn = 0;
2532   pcum->aapcs_nextncrn = 0;
2533   pcum->aapcs_nextnvrn = 0;
2534   pcum->pcs_variant = ARM_PCS_AAPCS64;
2535   pcum->aapcs_reg = NULL_RTX;
2536   pcum->aapcs_arg_processed = false;
2537   pcum->aapcs_stack_words = 0;
2538   pcum->aapcs_stack_size = 0;
2539
2540   if (!TARGET_FLOAT
2541       && fndecl && TREE_PUBLIC (fndecl)
2542       && fntype && fntype != error_mark_node)
2543     {
2544       const_tree type = TREE_TYPE (fntype);
2545       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2546       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2547       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2548                                                    &mode, &nregs, NULL))
2549         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2550     }
2551   return;
2552 }
2553
2554 static void
2555 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2556                               machine_mode mode,
2557                               const_tree type,
2558                               bool named)
2559 {
2560   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2561   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2562     {
2563       aarch64_layout_arg (pcum_v, mode, type, named);
2564       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2565                   != (pcum->aapcs_stack_words != 0));
2566       pcum->aapcs_arg_processed = false;
2567       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2568       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2569       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2570       pcum->aapcs_stack_words = 0;
2571       pcum->aapcs_reg = NULL_RTX;
2572     }
2573 }
2574
2575 bool
2576 aarch64_function_arg_regno_p (unsigned regno)
2577 {
2578   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2579           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2580 }
2581
2582 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2583    PARM_BOUNDARY bits of alignment, but will be given anything up
2584    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2585    that both before and after the layout of each argument, the Next
2586    Stacked Argument Address (NSAA) will have a minimum alignment of
2587    8 bytes.  */
2588
2589 static unsigned int
2590 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2591 {
2592   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2593   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2594 }
2595
2596 /* Implement TARGET_FUNCTION_ARG_PADDING.
2597
2598    Small aggregate types are placed in the lowest memory address.
2599
2600    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2601
2602 static pad_direction
2603 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2604 {
2605   /* On little-endian targets, the least significant byte of every stack
2606      argument is passed at the lowest byte address of the stack slot.  */
2607   if (!BYTES_BIG_ENDIAN)
2608     return PAD_UPWARD;
2609
2610   /* Otherwise, integral, floating-point and pointer types are padded downward:
2611      the least significant byte of a stack argument is passed at the highest
2612      byte address of the stack slot.  */
2613   if (type
2614       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2615          || POINTER_TYPE_P (type))
2616       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2617     return PAD_DOWNWARD;
2618
2619   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2620   return PAD_UPWARD;
2621 }
2622
2623 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2624
2625    It specifies padding for the last (may also be the only)
2626    element of a block move between registers and memory.  If
2627    assuming the block is in the memory, padding upward means that
2628    the last element is padded after its highest significant byte,
2629    while in downward padding, the last element is padded at the
2630    its least significant byte side.
2631
2632    Small aggregates and small complex types are always padded
2633    upwards.
2634
2635    We don't need to worry about homogeneous floating-point or
2636    short-vector aggregates; their move is not affected by the
2637    padding direction determined here.  Regardless of endianness,
2638    each element of such an aggregate is put in the least
2639    significant bits of a fp/simd register.
2640
2641    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2642    register has useful data, and return the opposite if the most
2643    significant byte does.  */
2644
2645 bool
2646 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2647                      bool first ATTRIBUTE_UNUSED)
2648 {
2649
2650   /* Small composite types are always padded upward.  */
2651   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2652     {
2653       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2654                             : GET_MODE_SIZE (mode));
2655       if (size < 2 * UNITS_PER_WORD)
2656         return true;
2657     }
2658
2659   /* Otherwise, use the default padding.  */
2660   return !BYTES_BIG_ENDIAN;
2661 }
2662
2663 static scalar_int_mode
2664 aarch64_libgcc_cmp_return_mode (void)
2665 {
2666   return SImode;
2667 }
2668
2669 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2670
2671 /* We use the 12-bit shifted immediate arithmetic instructions so values
2672    must be multiple of (1 << 12), i.e. 4096.  */
2673 #define ARITH_FACTOR 4096
2674
2675 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2676 #error Cannot use simple address calculation for stack probing
2677 #endif
2678
2679 /* The pair of scratch registers used for stack probing.  */
2680 #define PROBE_STACK_FIRST_REG  9
2681 #define PROBE_STACK_SECOND_REG 10
2682
2683 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2684    inclusive.  These are offsets from the current stack pointer.  */
2685
2686 static void
2687 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2688 {
2689   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2690
2691   /* See the same assertion on PROBE_INTERVAL above.  */
2692   gcc_assert ((first % ARITH_FACTOR) == 0);
2693
2694   /* See if we have a constant small number of probes to generate.  If so,
2695      that's the easy case.  */
2696   if (size <= PROBE_INTERVAL)
2697     {
2698       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2699
2700       emit_set_insn (reg1,
2701                      plus_constant (Pmode,
2702                                     stack_pointer_rtx, -(first + base)));
2703       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2704     }
2705
2706   /* The run-time loop is made up of 8 insns in the generic case while the
2707      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2708   else if (size <= 4 * PROBE_INTERVAL)
2709     {
2710       HOST_WIDE_INT i, rem;
2711
2712       emit_set_insn (reg1,
2713                      plus_constant (Pmode,
2714                                     stack_pointer_rtx,
2715                                     -(first + PROBE_INTERVAL)));
2716       emit_stack_probe (reg1);
2717
2718       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2719          it exceeds SIZE.  If only two probes are needed, this will not
2720          generate any code.  Then probe at FIRST + SIZE.  */
2721       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2722         {
2723           emit_set_insn (reg1,
2724                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2725           emit_stack_probe (reg1);
2726         }
2727
2728       rem = size - (i - PROBE_INTERVAL);
2729       if (rem > 256)
2730         {
2731           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2732
2733           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2734           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2735         }
2736       else
2737         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2738     }
2739
2740   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2741      extra careful with variables wrapping around because we might be at
2742      the very top (or the very bottom) of the address space and we have
2743      to be able to handle this case properly; in particular, we use an
2744      equality test for the loop condition.  */
2745   else
2746     {
2747       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2748
2749       /* Step 1: round SIZE to the previous multiple of the interval.  */
2750
2751       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2752
2753
2754       /* Step 2: compute initial and final value of the loop counter.  */
2755
2756       /* TEST_ADDR = SP + FIRST.  */
2757       emit_set_insn (reg1,
2758                      plus_constant (Pmode, stack_pointer_rtx, -first));
2759
2760       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2761       HOST_WIDE_INT adjustment = - (first + rounded_size);
2762       if (! aarch64_uimm12_shift (adjustment))
2763         {
2764           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2765                                           true, Pmode);
2766           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2767         }
2768       else
2769         {
2770           emit_set_insn (reg2,
2771                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2772         }
2773
2774       /* Step 3: the loop
2775
2776          do
2777            {
2778              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2779              probe at TEST_ADDR
2780            }
2781          while (TEST_ADDR != LAST_ADDR)
2782
2783          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2784          until it is equal to ROUNDED_SIZE.  */
2785
2786       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2787
2788
2789       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2790          that SIZE is equal to ROUNDED_SIZE.  */
2791
2792       if (size != rounded_size)
2793         {
2794           HOST_WIDE_INT rem = size - rounded_size;
2795
2796           if (rem > 256)
2797             {
2798               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2799
2800               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2801               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2802             }
2803           else
2804             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2805         }
2806     }
2807
2808   /* Make sure nothing is scheduled before we are done.  */
2809   emit_insn (gen_blockage ());
2810 }
2811
2812 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2813    absolute addresses.  */
2814
2815 const char *
2816 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2817 {
2818   static int labelno = 0;
2819   char loop_lab[32];
2820   rtx xops[2];
2821
2822   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2823
2824   /* Loop.  */
2825   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2826
2827   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2828   xops[0] = reg1;
2829   xops[1] = GEN_INT (PROBE_INTERVAL);
2830   output_asm_insn ("sub\t%0, %0, %1", xops);
2831
2832   /* Probe at TEST_ADDR.  */
2833   output_asm_insn ("str\txzr, [%0]", xops);
2834
2835   /* Test if TEST_ADDR == LAST_ADDR.  */
2836   xops[1] = reg2;
2837   output_asm_insn ("cmp\t%0, %1", xops);
2838
2839   /* Branch.  */
2840   fputs ("\tb.ne\t", asm_out_file);
2841   assemble_name_raw (asm_out_file, loop_lab);
2842   fputc ('\n', asm_out_file);
2843
2844   return "";
2845 }
2846
2847 static bool
2848 aarch64_frame_pointer_required (void)
2849 {
2850   /* In aarch64_override_options_after_change
2851      flag_omit_leaf_frame_pointer turns off the frame pointer by
2852      default.  Turn it back on now if we've not got a leaf
2853      function.  */
2854   if (flag_omit_leaf_frame_pointer
2855       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2856     return true;
2857
2858   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2859   if (crtl->calls_eh_return)
2860     return true;
2861
2862   return false;
2863 }
2864
2865 /* Mark the registers that need to be saved by the callee and calculate
2866    the size of the callee-saved registers area and frame record (both FP
2867    and LR may be omitted).  */
2868 static void
2869 aarch64_layout_frame (void)
2870 {
2871   HOST_WIDE_INT offset = 0;
2872   int regno, last_fp_reg = INVALID_REGNUM;
2873
2874   if (reload_completed && cfun->machine->frame.laid_out)
2875     return;
2876
2877 #define SLOT_NOT_REQUIRED (-2)
2878 #define SLOT_REQUIRED     (-1)
2879
2880   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2881   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2882
2883   /* First mark all the registers that really need to be saved...  */
2884   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2885     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2886
2887   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2888     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2889
2890   /* ... that includes the eh data registers (if needed)...  */
2891   if (crtl->calls_eh_return)
2892     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2893       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2894         = SLOT_REQUIRED;
2895
2896   /* ... and any callee saved register that dataflow says is live.  */
2897   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2898     if (df_regs_ever_live_p (regno)
2899         && (regno == R30_REGNUM
2900             || !call_used_regs[regno]))
2901       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2902
2903   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2904     if (df_regs_ever_live_p (regno)
2905         && !call_used_regs[regno])
2906       {
2907         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2908         last_fp_reg = regno;
2909       }
2910
2911   if (frame_pointer_needed)
2912     {
2913       /* FP and LR are placed in the linkage record.  */
2914       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2915       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2916       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2917       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2918       offset += 2 * UNITS_PER_WORD;
2919     }
2920
2921   /* Now assign stack slots for them.  */
2922   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2923     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2924       {
2925         cfun->machine->frame.reg_offset[regno] = offset;
2926         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2927           cfun->machine->frame.wb_candidate1 = regno;
2928         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2929           cfun->machine->frame.wb_candidate2 = regno;
2930         offset += UNITS_PER_WORD;
2931       }
2932
2933   HOST_WIDE_INT max_int_offset = offset;
2934   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2935   bool has_align_gap = offset != max_int_offset;
2936
2937   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2938     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2939       {
2940         /* If there is an alignment gap between integer and fp callee-saves,
2941            allocate the last fp register to it if possible.  */
2942         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2943           {
2944             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2945             break;
2946           }
2947
2948         cfun->machine->frame.reg_offset[regno] = offset;
2949         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2950           cfun->machine->frame.wb_candidate1 = regno;
2951         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2952                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2953           cfun->machine->frame.wb_candidate2 = regno;
2954         offset += UNITS_PER_WORD;
2955       }
2956
2957   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2958
2959   cfun->machine->frame.saved_regs_size = offset;
2960
2961   HOST_WIDE_INT varargs_and_saved_regs_size
2962     = offset + cfun->machine->frame.saved_varargs_size;
2963
2964   cfun->machine->frame.hard_fp_offset
2965     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2966                 STACK_BOUNDARY / BITS_PER_UNIT);
2967
2968   cfun->machine->frame.frame_size
2969     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2970                 + crtl->outgoing_args_size,
2971                 STACK_BOUNDARY / BITS_PER_UNIT);
2972
2973   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2974
2975   cfun->machine->frame.initial_adjust = 0;
2976   cfun->machine->frame.final_adjust = 0;
2977   cfun->machine->frame.callee_adjust = 0;
2978   cfun->machine->frame.callee_offset = 0;
2979
2980   HOST_WIDE_INT max_push_offset = 0;
2981   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2982     max_push_offset = 512;
2983   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2984     max_push_offset = 256;
2985
2986   if (cfun->machine->frame.frame_size < max_push_offset
2987       && crtl->outgoing_args_size == 0)
2988     {
2989       /* Simple, small frame with no outgoing arguments:
2990          stp reg1, reg2, [sp, -frame_size]!
2991          stp reg3, reg4, [sp, 16]  */
2992       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2993     }
2994   else if ((crtl->outgoing_args_size
2995             + cfun->machine->frame.saved_regs_size < 512)
2996            && !(cfun->calls_alloca
2997                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2998     {
2999       /* Frame with small outgoing arguments:
3000          sub sp, sp, frame_size
3001          stp reg1, reg2, [sp, outgoing_args_size]
3002          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
3003       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3004       cfun->machine->frame.callee_offset
3005         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3006     }
3007   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3008     {
3009       /* Frame with large outgoing arguments but a small local area:
3010          stp reg1, reg2, [sp, -hard_fp_offset]!
3011          stp reg3, reg4, [sp, 16]
3012          sub sp, sp, outgoing_args_size  */
3013       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3014       cfun->machine->frame.final_adjust
3015         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3016     }
3017   else if (!frame_pointer_needed
3018            && varargs_and_saved_regs_size < max_push_offset)
3019     {
3020       /* Frame with large local area and outgoing arguments (this pushes the
3021          callee-saves first, followed by the locals and outgoing area):
3022          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3023          stp reg3, reg4, [sp, 16]
3024          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
3025       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3026       cfun->machine->frame.final_adjust
3027         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3028       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3029       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3030     }
3031   else
3032     {
3033       /* Frame with large local area and outgoing arguments using frame pointer:
3034          sub sp, sp, hard_fp_offset
3035          stp x29, x30, [sp, 0]
3036          add x29, sp, 0
3037          stp reg3, reg4, [sp, 16]
3038          sub sp, sp, outgoing_args_size  */
3039       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3040       cfun->machine->frame.final_adjust
3041         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3042     }
3043
3044   cfun->machine->frame.laid_out = true;
3045 }
3046
3047 /* Return true if the register REGNO is saved on entry to
3048    the current function.  */
3049
3050 static bool
3051 aarch64_register_saved_on_entry (int regno)
3052 {
3053   return cfun->machine->frame.reg_offset[regno] >= 0;
3054 }
3055
3056 /* Return the next register up from REGNO up to LIMIT for the callee
3057    to save.  */
3058
3059 static unsigned
3060 aarch64_next_callee_save (unsigned regno, unsigned limit)
3061 {
3062   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3063     regno ++;
3064   return regno;
3065 }
3066
3067 /* Push the register number REGNO of mode MODE to the stack with write-back
3068    adjusting the stack by ADJUSTMENT.  */
3069
3070 static void
3071 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3072                            HOST_WIDE_INT adjustment)
3073  {
3074   rtx base_rtx = stack_pointer_rtx;
3075   rtx insn, reg, mem;
3076
3077   reg = gen_rtx_REG (mode, regno);
3078   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3079                             plus_constant (Pmode, base_rtx, -adjustment));
3080   mem = gen_frame_mem (mode, mem);
3081
3082   insn = emit_move_insn (mem, reg);
3083   RTX_FRAME_RELATED_P (insn) = 1;
3084 }
3085
3086 /* Generate and return an instruction to store the pair of registers
3087    REG and REG2 of mode MODE to location BASE with write-back adjusting
3088    the stack location BASE by ADJUSTMENT.  */
3089
3090 static rtx
3091 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3092                           HOST_WIDE_INT adjustment)
3093 {
3094   switch (mode)
3095     {
3096     case E_DImode:
3097       return gen_storewb_pairdi_di (base, base, reg, reg2,
3098                                     GEN_INT (-adjustment),
3099                                     GEN_INT (UNITS_PER_WORD - adjustment));
3100     case E_DFmode:
3101       return gen_storewb_pairdf_di (base, base, reg, reg2,
3102                                     GEN_INT (-adjustment),
3103                                     GEN_INT (UNITS_PER_WORD - adjustment));
3104     default:
3105       gcc_unreachable ();
3106     }
3107 }
3108
3109 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3110    stack pointer by ADJUSTMENT.  */
3111
3112 static void
3113 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3114 {
3115   rtx_insn *insn;
3116   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3117
3118   if (regno2 == INVALID_REGNUM)
3119     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3120
3121   rtx reg1 = gen_rtx_REG (mode, regno1);
3122   rtx reg2 = gen_rtx_REG (mode, regno2);
3123
3124   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3125                                               reg2, adjustment));
3126   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3127   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3128   RTX_FRAME_RELATED_P (insn) = 1;
3129 }
3130
3131 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3132    adjusting it by ADJUSTMENT afterwards.  */
3133
3134 static rtx
3135 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3136                          HOST_WIDE_INT adjustment)
3137 {
3138   switch (mode)
3139     {
3140     case E_DImode:
3141       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3142                                    GEN_INT (UNITS_PER_WORD));
3143     case E_DFmode:
3144       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3145                                    GEN_INT (UNITS_PER_WORD));
3146     default:
3147       gcc_unreachable ();
3148     }
3149 }
3150
3151 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3152    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3153    into CFI_OPS.  */
3154
3155 static void
3156 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3157                   rtx *cfi_ops)
3158 {
3159   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3160   rtx reg1 = gen_rtx_REG (mode, regno1);
3161
3162   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3163
3164   if (regno2 == INVALID_REGNUM)
3165     {
3166       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3167       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3168       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3169     }
3170   else
3171     {
3172       rtx reg2 = gen_rtx_REG (mode, regno2);
3173       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3174       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3175                                           reg2, adjustment));
3176     }
3177 }
3178
3179 /* Generate and return a store pair instruction of mode MODE to store
3180    register REG1 to MEM1 and register REG2 to MEM2.  */
3181
3182 static rtx
3183 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3184                         rtx reg2)
3185 {
3186   switch (mode)
3187     {
3188     case E_DImode:
3189       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3190
3191     case E_DFmode:
3192       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3193
3194     default:
3195       gcc_unreachable ();
3196     }
3197 }
3198
3199 /* Generate and regurn a load pair isntruction of mode MODE to load register
3200    REG1 from MEM1 and register REG2 from MEM2.  */
3201
3202 static rtx
3203 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3204                        rtx mem2)
3205 {
3206   switch (mode)
3207     {
3208     case E_DImode:
3209       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3210
3211     case E_DFmode:
3212       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3213
3214     default:
3215       gcc_unreachable ();
3216     }
3217 }
3218
3219 /* Return TRUE if return address signing should be enabled for the current
3220    function, otherwise return FALSE.  */
3221
3222 bool
3223 aarch64_return_address_signing_enabled (void)
3224 {
3225   /* This function should only be called after frame laid out.   */
3226   gcc_assert (cfun->machine->frame.laid_out);
3227
3228   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3229      if it's LR is pushed onto stack.  */
3230   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3231           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3232               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3233 }
3234
3235 /* Emit code to save the callee-saved registers from register number START
3236    to LIMIT to the stack at the location starting at offset START_OFFSET,
3237    skipping any write-back candidates if SKIP_WB is true.  */
3238
3239 static void
3240 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3241                            unsigned start, unsigned limit, bool skip_wb)
3242 {
3243   rtx_insn *insn;
3244   unsigned regno;
3245   unsigned regno2;
3246
3247   for (regno = aarch64_next_callee_save (start, limit);
3248        regno <= limit;
3249        regno = aarch64_next_callee_save (regno + 1, limit))
3250     {
3251       rtx reg, mem;
3252       HOST_WIDE_INT offset;
3253
3254       if (skip_wb
3255           && (regno == cfun->machine->frame.wb_candidate1
3256               || regno == cfun->machine->frame.wb_candidate2))
3257         continue;
3258
3259       if (cfun->machine->reg_is_wrapped_separately[regno])
3260        continue;
3261
3262       reg = gen_rtx_REG (mode, regno);
3263       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3264       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3265                                                 offset));
3266
3267       regno2 = aarch64_next_callee_save (regno + 1, limit);
3268
3269       if (regno2 <= limit
3270           && !cfun->machine->reg_is_wrapped_separately[regno2]
3271           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3272               == cfun->machine->frame.reg_offset[regno2]))
3273
3274         {
3275           rtx reg2 = gen_rtx_REG (mode, regno2);
3276           rtx mem2;
3277
3278           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3279           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3280                                                      offset));
3281           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3282                                                     reg2));
3283
3284           /* The first part of a frame-related parallel insn is
3285              always assumed to be relevant to the frame
3286              calculations; subsequent parts, are only
3287              frame-related if explicitly marked.  */
3288           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3289           regno = regno2;
3290         }
3291       else
3292         insn = emit_move_insn (mem, reg);
3293
3294       RTX_FRAME_RELATED_P (insn) = 1;
3295     }
3296 }
3297
3298 /* Emit code to restore the callee registers of mode MODE from register
3299    number START up to and including LIMIT.  Restore from the stack offset
3300    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3301    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3302
3303 static void
3304 aarch64_restore_callee_saves (machine_mode mode,
3305                               HOST_WIDE_INT start_offset, unsigned start,
3306                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3307 {
3308   rtx base_rtx = stack_pointer_rtx;
3309   unsigned regno;
3310   unsigned regno2;
3311   HOST_WIDE_INT offset;
3312
3313   for (regno = aarch64_next_callee_save (start, limit);
3314        regno <= limit;
3315        regno = aarch64_next_callee_save (regno + 1, limit))
3316     {
3317       if (cfun->machine->reg_is_wrapped_separately[regno])
3318        continue;
3319
3320       rtx reg, mem;
3321
3322       if (skip_wb
3323           && (regno == cfun->machine->frame.wb_candidate1
3324               || regno == cfun->machine->frame.wb_candidate2))
3325         continue;
3326
3327       reg = gen_rtx_REG (mode, regno);
3328       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3329       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3330
3331       regno2 = aarch64_next_callee_save (regno + 1, limit);
3332
3333       if (regno2 <= limit
3334           && !cfun->machine->reg_is_wrapped_separately[regno2]
3335           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3336               == cfun->machine->frame.reg_offset[regno2]))
3337         {
3338           rtx reg2 = gen_rtx_REG (mode, regno2);
3339           rtx mem2;
3340
3341           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3342           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3343           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3344
3345           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3346           regno = regno2;
3347         }
3348       else
3349         emit_move_insn (reg, mem);
3350       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3351     }
3352 }
3353
3354 static inline bool
3355 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3356                                HOST_WIDE_INT offset)
3357 {
3358   return offset >= -256 && offset < 256;
3359 }
3360
3361 static inline bool
3362 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3363 {
3364   return (offset >= 0
3365           && offset < 4096 * GET_MODE_SIZE (mode)
3366           && offset % GET_MODE_SIZE (mode) == 0);
3367 }
3368
3369 bool
3370 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 {
3372   return (offset >= -64 * GET_MODE_SIZE (mode)
3373           && offset < 64 * GET_MODE_SIZE (mode)
3374           && offset % GET_MODE_SIZE (mode) == 0);
3375 }
3376
3377 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3378
3379 static sbitmap
3380 aarch64_get_separate_components (void)
3381 {
3382   aarch64_layout_frame ();
3383
3384   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3385   bitmap_clear (components);
3386
3387   /* The registers we need saved to the frame.  */
3388   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3389     if (aarch64_register_saved_on_entry (regno))
3390       {
3391         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3392         if (!frame_pointer_needed)
3393           offset += cfun->machine->frame.frame_size
3394                     - cfun->machine->frame.hard_fp_offset;
3395         /* Check that we can access the stack slot of the register with one
3396            direct load with no adjustments needed.  */
3397         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3398           bitmap_set_bit (components, regno);
3399       }
3400
3401   /* Don't mess with the hard frame pointer.  */
3402   if (frame_pointer_needed)
3403     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3404
3405   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3406   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3407   /* If aarch64_layout_frame has chosen registers to store/restore with
3408      writeback don't interfere with them to avoid having to output explicit
3409      stack adjustment instructions.  */
3410   if (reg2 != INVALID_REGNUM)
3411     bitmap_clear_bit (components, reg2);
3412   if (reg1 != INVALID_REGNUM)
3413     bitmap_clear_bit (components, reg1);
3414
3415   bitmap_clear_bit (components, LR_REGNUM);
3416   bitmap_clear_bit (components, SP_REGNUM);
3417
3418   return components;
3419 }
3420
3421 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3422
3423 static sbitmap
3424 aarch64_components_for_bb (basic_block bb)
3425 {
3426   bitmap in = DF_LIVE_IN (bb);
3427   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3428   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3429
3430   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3431   bitmap_clear (components);
3432
3433   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3434   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3435     if ((!call_used_regs[regno])
3436        && (bitmap_bit_p (in, regno)
3437            || bitmap_bit_p (gen, regno)
3438            || bitmap_bit_p (kill, regno)))
3439           bitmap_set_bit (components, regno);
3440
3441   return components;
3442 }
3443
3444 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3445    Nothing to do for aarch64.  */
3446
3447 static void
3448 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3449 {
3450 }
3451
3452 /* Return the next set bit in BMP from START onwards.  Return the total number
3453    of bits in BMP if no set bit is found at or after START.  */
3454
3455 static unsigned int
3456 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3457 {
3458   unsigned int nbits = SBITMAP_SIZE (bmp);
3459   if (start == nbits)
3460     return start;
3461
3462   gcc_assert (start < nbits);
3463   for (unsigned int i = start; i < nbits; i++)
3464     if (bitmap_bit_p (bmp, i))
3465       return i;
3466
3467   return nbits;
3468 }
3469
3470 /* Do the work for aarch64_emit_prologue_components and
3471    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3472    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3473    for these components or the epilogue sequence.  That is, it determines
3474    whether we should emit stores or loads and what kind of CFA notes to attach
3475    to the insns.  Otherwise the logic for the two sequences is very
3476    similar.  */
3477
3478 static void
3479 aarch64_process_components (sbitmap components, bool prologue_p)
3480 {
3481   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3482                              ? HARD_FRAME_POINTER_REGNUM
3483                              : STACK_POINTER_REGNUM);
3484
3485   unsigned last_regno = SBITMAP_SIZE (components);
3486   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3487   rtx_insn *insn = NULL;
3488
3489   while (regno != last_regno)
3490     {
3491       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3492          so DFmode for the vector registers is enough.  */
3493       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3494       rtx reg = gen_rtx_REG (mode, regno);
3495       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3496       if (!frame_pointer_needed)
3497         offset += cfun->machine->frame.frame_size
3498                   - cfun->machine->frame.hard_fp_offset;
3499       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3500       rtx mem = gen_frame_mem (mode, addr);
3501
3502       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3503       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3504       /* No more registers to handle after REGNO.
3505          Emit a single save/restore and exit.  */
3506       if (regno2 == last_regno)
3507         {
3508           insn = emit_insn (set);
3509           RTX_FRAME_RELATED_P (insn) = 1;
3510           if (prologue_p)
3511             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3512           else
3513             add_reg_note (insn, REG_CFA_RESTORE, reg);
3514           break;
3515         }
3516
3517       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3518       /* The next register is not of the same class or its offset is not
3519          mergeable with the current one into a pair.  */
3520       if (!satisfies_constraint_Ump (mem)
3521           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3522           || (offset2 - cfun->machine->frame.reg_offset[regno])
3523                 != GET_MODE_SIZE (mode))
3524         {
3525           insn = emit_insn (set);
3526           RTX_FRAME_RELATED_P (insn) = 1;
3527           if (prologue_p)
3528             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3529           else
3530             add_reg_note (insn, REG_CFA_RESTORE, reg);
3531
3532           regno = regno2;
3533           continue;
3534         }
3535
3536       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3537       rtx reg2 = gen_rtx_REG (mode, regno2);
3538       if (!frame_pointer_needed)
3539         offset2 += cfun->machine->frame.frame_size
3540                   - cfun->machine->frame.hard_fp_offset;
3541       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3542       rtx mem2 = gen_frame_mem (mode, addr2);
3543       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3544                              : gen_rtx_SET (reg2, mem2);
3545
3546       if (prologue_p)
3547         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3548       else
3549         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3550
3551       RTX_FRAME_RELATED_P (insn) = 1;
3552       if (prologue_p)
3553         {
3554           add_reg_note (insn, REG_CFA_OFFSET, set);
3555           add_reg_note (insn, REG_CFA_OFFSET, set2);
3556         }
3557       else
3558         {
3559           add_reg_note (insn, REG_CFA_RESTORE, reg);
3560           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3561         }
3562
3563       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3564     }
3565 }
3566
3567 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3568
3569 static void
3570 aarch64_emit_prologue_components (sbitmap components)
3571 {
3572   aarch64_process_components (components, true);
3573 }
3574
3575 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3576
3577 static void
3578 aarch64_emit_epilogue_components (sbitmap components)
3579 {
3580   aarch64_process_components (components, false);
3581 }
3582
3583 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3584
3585 static void
3586 aarch64_set_handled_components (sbitmap components)
3587 {
3588   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3589     if (bitmap_bit_p (components, regno))
3590       cfun->machine->reg_is_wrapped_separately[regno] = true;
3591 }
3592
3593 /* AArch64 stack frames generated by this compiler look like:
3594
3595         +-------------------------------+
3596         |                               |
3597         |  incoming stack arguments     |
3598         |                               |
3599         +-------------------------------+
3600         |                               | <-- incoming stack pointer (aligned)
3601         |  callee-allocated save area   |
3602         |  for register varargs         |
3603         |                               |
3604         +-------------------------------+
3605         |  local variables              | <-- frame_pointer_rtx
3606         |                               |
3607         +-------------------------------+
3608         |  padding0                     | \
3609         +-------------------------------+  |
3610         |  callee-saved registers       |  | frame.saved_regs_size
3611         +-------------------------------+  |
3612         |  LR'                          |  |
3613         +-------------------------------+  |
3614         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3615         +-------------------------------+
3616         |  dynamic allocation           |
3617         +-------------------------------+
3618         |  padding                      |
3619         +-------------------------------+
3620         |  outgoing stack arguments     | <-- arg_pointer
3621         |                               |
3622         +-------------------------------+
3623         |                               | <-- stack_pointer_rtx (aligned)
3624
3625    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3626    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3627    unchanged.  */
3628
3629 /* Generate the prologue instructions for entry into a function.
3630    Establish the stack frame by decreasing the stack pointer with a
3631    properly calculated size and, if necessary, create a frame record
3632    filled with the values of LR and previous frame pointer.  The
3633    current FP is also set up if it is in use.  */
3634
3635 void
3636 aarch64_expand_prologue (void)
3637 {
3638   aarch64_layout_frame ();
3639
3640   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3641   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3642   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3643   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3644   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3645   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3646   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3647   rtx_insn *insn;
3648
3649   /* Sign return address for functions.  */
3650   if (aarch64_return_address_signing_enabled ())
3651     {
3652       insn = emit_insn (gen_pacisp ());
3653       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3654       RTX_FRAME_RELATED_P (insn) = 1;
3655     }
3656
3657   if (flag_stack_usage_info)
3658     current_function_static_stack_size = frame_size;
3659
3660   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3661     {
3662       if (crtl->is_leaf && !cfun->calls_alloca)
3663         {
3664           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3665             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3666                                             frame_size - STACK_CHECK_PROTECT);
3667         }
3668       else if (frame_size > 0)
3669         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3670     }
3671
3672   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3673
3674   if (callee_adjust != 0)
3675     aarch64_push_regs (reg1, reg2, callee_adjust);
3676
3677   if (frame_pointer_needed)
3678     {
3679       if (callee_adjust == 0)
3680         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3681                                    R30_REGNUM, false);
3682       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3683                                        stack_pointer_rtx,
3684                                        GEN_INT (callee_offset)));
3685       RTX_FRAME_RELATED_P (insn) = 1;
3686       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3687     }
3688
3689   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3690                              callee_adjust != 0 || frame_pointer_needed);
3691   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3692                              callee_adjust != 0 || frame_pointer_needed);
3693   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3694 }
3695
3696 /* Return TRUE if we can use a simple_return insn.
3697
3698    This function checks whether the callee saved stack is empty, which
3699    means no restore actions are need. The pro_and_epilogue will use
3700    this to check whether shrink-wrapping opt is feasible.  */
3701
3702 bool
3703 aarch64_use_return_insn_p (void)
3704 {
3705   if (!reload_completed)
3706     return false;
3707
3708   if (crtl->profile)
3709     return false;
3710
3711   aarch64_layout_frame ();
3712
3713   return cfun->machine->frame.frame_size == 0;
3714 }
3715
3716 /* Generate the epilogue instructions for returning from a function.
3717    This is almost exactly the reverse of the prolog sequence, except
3718    that we need to insert barriers to avoid scheduling loads that read
3719    from a deallocated stack, and we optimize the unwind records by
3720    emitting them all together if possible.  */
3721 void
3722 aarch64_expand_epilogue (bool for_sibcall)
3723 {
3724   aarch64_layout_frame ();
3725
3726   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3727   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3728   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3729   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3730   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3731   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3732   rtx cfi_ops = NULL;
3733   rtx_insn *insn;
3734
3735   /* We need to add memory barrier to prevent read from deallocated stack.  */
3736   bool need_barrier_p = (get_frame_size ()
3737                          + cfun->machine->frame.saved_varargs_size) != 0;
3738
3739   /* Emit a barrier to prevent loads from a deallocated stack.  */
3740   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3741       || crtl->calls_eh_return)
3742     {
3743       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3744       need_barrier_p = false;
3745     }
3746
3747   /* Restore the stack pointer from the frame pointer if it may not
3748      be the same as the stack pointer.  */
3749   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3750     {
3751       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3752                                        hard_frame_pointer_rtx,
3753                                        GEN_INT (-callee_offset)));
3754       /* If writeback is used when restoring callee-saves, the CFA
3755          is restored on the instruction doing the writeback.  */
3756       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3757     }
3758   else
3759     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3760
3761   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3762                                 callee_adjust != 0, &cfi_ops);
3763   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3764                                 callee_adjust != 0, &cfi_ops);
3765
3766   if (need_barrier_p)
3767     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3768
3769   if (callee_adjust != 0)
3770     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3771
3772   if (callee_adjust != 0 || initial_adjust > 65536)
3773     {
3774       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3775       insn = get_last_insn ();
3776       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3777       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3778       RTX_FRAME_RELATED_P (insn) = 1;
3779       cfi_ops = NULL;
3780     }
3781
3782   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3783
3784   if (cfi_ops)
3785     {
3786       /* Emit delayed restores and reset the CFA to be SP.  */
3787       insn = get_last_insn ();
3788       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3789       REG_NOTES (insn) = cfi_ops;
3790       RTX_FRAME_RELATED_P (insn) = 1;
3791     }
3792
3793   /* We prefer to emit the combined return/authenticate instruction RETAA,
3794      however there are three cases in which we must instead emit an explicit
3795      authentication instruction.
3796
3797         1) Sibcalls don't return in a normal way, so if we're about to call one
3798            we must authenticate.
3799
3800         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3801            generating code for !TARGET_ARMV8_3 we can't use it and must
3802            explicitly authenticate.
3803
3804         3) On an eh_return path we make extra stack adjustments to update the
3805            canonical frame address to be the exception handler's CFA.  We want
3806            to authenticate using the CFA of the function which calls eh_return.
3807     */
3808   if (aarch64_return_address_signing_enabled ()
3809       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3810     {
3811       insn = emit_insn (gen_autisp ());
3812       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3813       RTX_FRAME_RELATED_P (insn) = 1;
3814     }
3815
3816   /* Stack adjustment for exception handler.  */
3817   if (crtl->calls_eh_return)
3818     {
3819       /* We need to unwind the stack by the offset computed by
3820          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3821          to be SP; letting the CFA move during this adjustment
3822          is just as correct as retaining the CFA from the body
3823          of the function.  Therefore, do nothing special.  */
3824       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3825     }
3826
3827   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3828   if (!for_sibcall)
3829     emit_jump_insn (ret_rtx);
3830 }
3831
3832 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3833    normally or return to a previous frame after unwinding.
3834
3835    An EH return uses a single shared return sequence.  The epilogue is
3836    exactly like a normal epilogue except that it has an extra input
3837    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3838    that must be applied after the frame has been destroyed.  An extra label
3839    is inserted before the epilogue which initializes this register to zero,
3840    and this is the entry point for a normal return.
3841
3842    An actual EH return updates the return address, initializes the stack
3843    adjustment and jumps directly into the epilogue (bypassing the zeroing
3844    of the adjustment).  Since the return address is typically saved on the
3845    stack when a function makes a call, the saved LR must be updated outside
3846    the epilogue.
3847
3848    This poses problems as the store is generated well before the epilogue,
3849    so the offset of LR is not known yet.  Also optimizations will remove the
3850    store as it appears dead, even after the epilogue is generated (as the
3851    base or offset for loading LR is different in many cases).
3852
3853    To avoid these problems this implementation forces the frame pointer
3854    in eh_return functions so that the location of LR is fixed and known early.
3855    It also marks the store volatile, so no optimization is permitted to
3856    remove the store.  */
3857 rtx
3858 aarch64_eh_return_handler_rtx (void)
3859 {
3860   rtx tmp = gen_frame_mem (Pmode,
3861     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3862
3863   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3864   MEM_VOLATILE_P (tmp) = true;
3865   return tmp;
3866 }
3867
3868 /* Output code to add DELTA to the first argument, and then jump
3869    to FUNCTION.  Used for C++ multiple inheritance.  */
3870 static void
3871 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3872                          HOST_WIDE_INT delta,
3873                          HOST_WIDE_INT vcall_offset,
3874                          tree function)
3875 {
3876   /* The this pointer is always in x0.  Note that this differs from
3877      Arm where the this pointer maybe bumped to r1 if r0 is required
3878      to return a pointer to an aggregate.  On AArch64 a result value
3879      pointer will be in x8.  */
3880   int this_regno = R0_REGNUM;
3881   rtx this_rtx, temp0, temp1, addr, funexp;
3882   rtx_insn *insn;
3883
3884   reload_completed = 1;
3885   emit_note (NOTE_INSN_PROLOGUE_END);
3886
3887   if (vcall_offset == 0)
3888     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3889   else
3890     {
3891       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3892
3893       this_rtx = gen_rtx_REG (Pmode, this_regno);
3894       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3895       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3896
3897       addr = this_rtx;
3898       if (delta != 0)
3899         {
3900           if (delta >= -256 && delta < 256)
3901             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3902                                        plus_constant (Pmode, this_rtx, delta));
3903           else
3904             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3905         }
3906
3907       if (Pmode == ptr_mode)
3908         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3909       else
3910         aarch64_emit_move (temp0,
3911                            gen_rtx_ZERO_EXTEND (Pmode,
3912                                                 gen_rtx_MEM (ptr_mode, addr)));
3913
3914       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3915           addr = plus_constant (Pmode, temp0, vcall_offset);
3916       else
3917         {
3918           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3919                                           Pmode);
3920           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3921         }
3922
3923       if (Pmode == ptr_mode)
3924         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3925       else
3926         aarch64_emit_move (temp1,
3927                            gen_rtx_SIGN_EXTEND (Pmode,
3928                                                 gen_rtx_MEM (ptr_mode, addr)));
3929
3930       emit_insn (gen_add2_insn (this_rtx, temp1));
3931     }
3932
3933   /* Generate a tail call to the target function.  */
3934   if (!TREE_USED (function))
3935     {
3936       assemble_external (function);
3937       TREE_USED (function) = 1;
3938     }
3939   funexp = XEXP (DECL_RTL (function), 0);
3940   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3941   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3942   SIBLING_CALL_P (insn) = 1;
3943
3944   insn = get_insns ();
3945   shorten_branches (insn);
3946   final_start_function (insn, file, 1);
3947   final (insn, file, 1);
3948   final_end_function ();
3949
3950   /* Stop pretending to be a post-reload pass.  */
3951   reload_completed = 0;
3952 }
3953
3954 static bool
3955 aarch64_tls_referenced_p (rtx x)
3956 {
3957   if (!TARGET_HAVE_TLS)
3958     return false;
3959   subrtx_iterator::array_type array;
3960   FOR_EACH_SUBRTX (iter, array, x, ALL)
3961     {
3962       const_rtx x = *iter;
3963       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3964         return true;
3965       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3966          TLS offsets, not real symbol references.  */
3967       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3968         iter.skip_subrtxes ();
3969     }
3970   return false;
3971 }
3972
3973
3974 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3975    a left shift of 0 or 12 bits.  */
3976 bool
3977 aarch64_uimm12_shift (HOST_WIDE_INT val)
3978 {
3979   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3980           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3981           );
3982 }
3983
3984
3985 /* Return true if val is an immediate that can be loaded into a
3986    register by a MOVZ instruction.  */
3987 static bool
3988 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3989 {
3990   if (GET_MODE_SIZE (mode) > 4)
3991     {
3992       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3993           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3994         return 1;
3995     }
3996   else
3997     {
3998       /* Ignore sign extension.  */
3999       val &= (HOST_WIDE_INT) 0xffffffff;
4000     }
4001   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4002           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4003 }
4004
4005 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4006
4007 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4008   {
4009     0x0000000100000001ull,
4010     0x0001000100010001ull,
4011     0x0101010101010101ull,
4012     0x1111111111111111ull,
4013     0x5555555555555555ull,
4014   };
4015
4016
4017 /* Return true if val is a valid bitmask immediate.  */
4018
4019 bool
4020 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4021 {
4022   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4023   int bits;
4024
4025   /* Check for a single sequence of one bits and return quickly if so.
4026      The special cases of all ones and all zeroes returns false.  */
4027   val = (unsigned HOST_WIDE_INT) val_in;
4028   tmp = val + (val & -val);
4029
4030   if (tmp == (tmp & -tmp))
4031     return (val + 1) > 1;
4032
4033   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4034   if (mode == SImode)
4035     val = (val << 32) | (val & 0xffffffff);
4036
4037   /* Invert if the immediate doesn't start with a zero bit - this means we
4038      only need to search for sequences of one bits.  */
4039   if (val & 1)
4040     val = ~val;
4041
4042   /* Find the first set bit and set tmp to val with the first sequence of one
4043      bits removed.  Return success if there is a single sequence of ones.  */
4044   first_one = val & -val;
4045   tmp = val & (val + first_one);
4046
4047   if (tmp == 0)
4048     return true;
4049
4050   /* Find the next set bit and compute the difference in bit position.  */
4051   next_one = tmp & -tmp;
4052   bits = clz_hwi (first_one) - clz_hwi (next_one);
4053   mask = val ^ tmp;
4054
4055   /* Check the bit position difference is a power of 2, and that the first
4056      sequence of one bits fits within 'bits' bits.  */
4057   if ((mask >> bits) != 0 || bits != (bits & -bits))
4058     return false;
4059
4060   /* Check the sequence of one bits is repeated 64/bits times.  */
4061   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4062 }
4063
4064 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4065    Assumed precondition: VAL_IN Is not zero.  */
4066
4067 unsigned HOST_WIDE_INT
4068 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4069 {
4070   int lowest_bit_set = ctz_hwi (val_in);
4071   int highest_bit_set = floor_log2 (val_in);
4072   gcc_assert (val_in != 0);
4073
4074   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4075           (HOST_WIDE_INT_1U << lowest_bit_set));
4076 }
4077
4078 /* Create constant where bits outside of lowest bit set to highest bit set
4079    are set to 1.  */
4080
4081 unsigned HOST_WIDE_INT
4082 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4083 {
4084   return val_in | ~aarch64_and_split_imm1 (val_in);
4085 }
4086
4087 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4088
4089 bool
4090 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4091 {
4092   if (aarch64_bitmask_imm (val_in, mode))
4093     return false;
4094
4095   if (aarch64_move_imm (val_in, mode))
4096     return false;
4097
4098   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4099
4100   return aarch64_bitmask_imm (imm2, mode);
4101 }
4102
4103 /* Return true if val is an immediate that can be loaded into a
4104    register in a single instruction.  */
4105 bool
4106 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4107 {
4108   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4109     return 1;
4110   return aarch64_bitmask_imm (val, mode);
4111 }
4112
4113 static bool
4114 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4115 {
4116   rtx base, offset;
4117
4118   if (GET_CODE (x) == HIGH)
4119     return true;
4120
4121   split_const (x, &base, &offset);
4122   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4123     {
4124       if (aarch64_classify_symbol (base, offset)
4125           != SYMBOL_FORCE_TO_MEM)
4126         return true;
4127       else
4128         /* Avoid generating a 64-bit relocation in ILP32; leave
4129            to aarch64_expand_mov_immediate to handle it properly.  */
4130         return mode != ptr_mode;
4131     }
4132
4133   return aarch64_tls_referenced_p (x);
4134 }
4135
4136 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4137    The expansion for a table switch is quite expensive due to the number
4138    of instructions, the table lookup and hard to predict indirect jump.
4139    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4140    set, otherwise use tables for > 16 cases as a tradeoff between size and
4141    performance.  When optimizing for size, use the default setting.  */
4142
4143 static unsigned int
4144 aarch64_case_values_threshold (void)
4145 {
4146   /* Use the specified limit for the number of cases before using jump
4147      tables at higher optimization levels.  */
4148   if (optimize > 2
4149       && selected_cpu->tune->max_case_values != 0)
4150     return selected_cpu->tune->max_case_values;
4151   else
4152     return optimize_size ? default_case_values_threshold () : 17;
4153 }
4154
4155 /* Return true if register REGNO is a valid index register.
4156    STRICT_P is true if REG_OK_STRICT is in effect.  */
4157
4158 bool
4159 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4160 {
4161   if (!HARD_REGISTER_NUM_P (regno))
4162     {
4163       if (!strict_p)
4164         return true;
4165
4166       if (!reg_renumber)
4167         return false;
4168
4169       regno = reg_renumber[regno];
4170     }
4171   return GP_REGNUM_P (regno);
4172 }
4173
4174 /* Return true if register REGNO is a valid base register for mode MODE.
4175    STRICT_P is true if REG_OK_STRICT is in effect.  */
4176
4177 bool
4178 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4179 {
4180   if (!HARD_REGISTER_NUM_P (regno))
4181     {
4182       if (!strict_p)
4183         return true;
4184
4185       if (!reg_renumber)
4186         return false;
4187
4188       regno = reg_renumber[regno];
4189     }
4190
4191   /* The fake registers will be eliminated to either the stack or
4192      hard frame pointer, both of which are usually valid base registers.
4193      Reload deals with the cases where the eliminated form isn't valid.  */
4194   return (GP_REGNUM_P (regno)
4195           || regno == SP_REGNUM
4196           || regno == FRAME_POINTER_REGNUM
4197           || regno == ARG_POINTER_REGNUM);
4198 }
4199
4200 /* Return true if X is a valid base register for mode MODE.
4201    STRICT_P is true if REG_OK_STRICT is in effect.  */
4202
4203 static bool
4204 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4205 {
4206   if (!strict_p
4207       && GET_CODE (x) == SUBREG
4208       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4209     x = SUBREG_REG (x);
4210
4211   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4212 }
4213
4214 /* Return true if address offset is a valid index.  If it is, fill in INFO
4215    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4216
4217 static bool
4218 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4219                         machine_mode mode, bool strict_p)
4220 {
4221   enum aarch64_address_type type;
4222   rtx index;
4223   int shift;
4224
4225   /* (reg:P) */
4226   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4227       && GET_MODE (x) == Pmode)
4228     {
4229       type = ADDRESS_REG_REG;
4230       index = x;
4231       shift = 0;
4232     }
4233   /* (sign_extend:DI (reg:SI)) */
4234   else if ((GET_CODE (x) == SIGN_EXTEND
4235             || GET_CODE (x) == ZERO_EXTEND)
4236            && GET_MODE (x) == DImode
4237            && GET_MODE (XEXP (x, 0)) == SImode)
4238     {
4239       type = (GET_CODE (x) == SIGN_EXTEND)
4240         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4241       index = XEXP (x, 0);
4242       shift = 0;
4243     }
4244   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4245   else if (GET_CODE (x) == MULT
4246            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4247                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4248            && GET_MODE (XEXP (x, 0)) == DImode
4249            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4250            && CONST_INT_P (XEXP (x, 1)))
4251     {
4252       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4253         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4254       index = XEXP (XEXP (x, 0), 0);
4255       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4256     }
4257   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4258   else if (GET_CODE (x) == ASHIFT
4259            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4260                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4261            && GET_MODE (XEXP (x, 0)) == DImode
4262            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4263            && CONST_INT_P (XEXP (x, 1)))
4264     {
4265       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4266         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267       index = XEXP (XEXP (x, 0), 0);
4268       shift = INTVAL (XEXP (x, 1));
4269     }
4270   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4271   else if ((GET_CODE (x) == SIGN_EXTRACT
4272             || GET_CODE (x) == ZERO_EXTRACT)
4273            && GET_MODE (x) == DImode
4274            && GET_CODE (XEXP (x, 0)) == MULT
4275            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4276            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4277     {
4278       type = (GET_CODE (x) == SIGN_EXTRACT)
4279         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280       index = XEXP (XEXP (x, 0), 0);
4281       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4282       if (INTVAL (XEXP (x, 1)) != 32 + shift
4283           || INTVAL (XEXP (x, 2)) != 0)
4284         shift = -1;
4285     }
4286   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4287      (const_int 0xffffffff<<shift)) */
4288   else if (GET_CODE (x) == AND
4289            && GET_MODE (x) == DImode
4290            && GET_CODE (XEXP (x, 0)) == MULT
4291            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4292            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4293            && CONST_INT_P (XEXP (x, 1)))
4294     {
4295       type = ADDRESS_REG_UXTW;
4296       index = XEXP (XEXP (x, 0), 0);
4297       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4298       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4299         shift = -1;
4300     }
4301   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4302   else if ((GET_CODE (x) == SIGN_EXTRACT
4303             || GET_CODE (x) == ZERO_EXTRACT)
4304            && GET_MODE (x) == DImode
4305            && GET_CODE (XEXP (x, 0)) == ASHIFT
4306            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4307            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4308     {
4309       type = (GET_CODE (x) == SIGN_EXTRACT)
4310         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4311       index = XEXP (XEXP (x, 0), 0);
4312       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4313       if (INTVAL (XEXP (x, 1)) != 32 + shift
4314           || INTVAL (XEXP (x, 2)) != 0)
4315         shift = -1;
4316     }
4317   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4318      (const_int 0xffffffff<<shift)) */
4319   else if (GET_CODE (x) == AND
4320            && GET_MODE (x) == DImode
4321            && GET_CODE (XEXP (x, 0)) == ASHIFT
4322            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4323            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4324            && CONST_INT_P (XEXP (x, 1)))
4325     {
4326       type = ADDRESS_REG_UXTW;
4327       index = XEXP (XEXP (x, 0), 0);
4328       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4329       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4330         shift = -1;
4331     }
4332   /* (mult:P (reg:P) (const_int scale)) */
4333   else if (GET_CODE (x) == MULT
4334            && GET_MODE (x) == Pmode
4335            && GET_MODE (XEXP (x, 0)) == Pmode
4336            && CONST_INT_P (XEXP (x, 1)))
4337     {
4338       type = ADDRESS_REG_REG;
4339       index = XEXP (x, 0);
4340       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4341     }
4342   /* (ashift:P (reg:P) (const_int shift)) */
4343   else if (GET_CODE (x) == ASHIFT
4344            && GET_MODE (x) == Pmode
4345            && GET_MODE (XEXP (x, 0)) == Pmode
4346            && CONST_INT_P (XEXP (x, 1)))
4347     {
4348       type = ADDRESS_REG_REG;
4349       index = XEXP (x, 0);
4350       shift = INTVAL (XEXP (x, 1));
4351     }
4352   else
4353     return false;
4354
4355   if (!strict_p
4356       && GET_CODE (index) == SUBREG
4357       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4358     index = SUBREG_REG (index);
4359
4360   if ((shift == 0 ||
4361        (shift > 0 && shift <= 3
4362         && (1 << shift) == GET_MODE_SIZE (mode)))
4363       && REG_P (index)
4364       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4365     {
4366       info->type = type;
4367       info->offset = index;
4368       info->shift = shift;
4369       return true;
4370     }
4371
4372   return false;
4373 }
4374
4375 /* Return true if MODE is one of the modes for which we
4376    support LDP/STP operations.  */
4377
4378 static bool
4379 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4380 {
4381   return mode == SImode || mode == DImode
4382          || mode == SFmode || mode == DFmode
4383          || (aarch64_vector_mode_supported_p (mode)
4384              && GET_MODE_SIZE (mode) == 8);
4385 }
4386
4387 /* Return true if REGNO is a virtual pointer register, or an eliminable
4388    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4389    include stack_pointer or hard_frame_pointer.  */
4390 static bool
4391 virt_or_elim_regno_p (unsigned regno)
4392 {
4393   return ((regno >= FIRST_VIRTUAL_REGISTER
4394            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4395           || regno == FRAME_POINTER_REGNUM
4396           || regno == ARG_POINTER_REGNUM);
4397 }
4398
4399 /* Return true if X is a valid address for machine mode MODE.  If it is,
4400    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4401    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4402
4403 static bool
4404 aarch64_classify_address (struct aarch64_address_info *info,
4405                           rtx x, machine_mode mode,
4406                           RTX_CODE outer_code, bool strict_p)
4407 {
4408   enum rtx_code code = GET_CODE (x);
4409   rtx op0, op1;
4410
4411   /* On BE, we use load/store pair for all large int mode load/stores.
4412      TI/TFmode may also use a load/store pair.  */
4413   bool load_store_pair_p = (outer_code == PARALLEL
4414                             || mode == TImode
4415                             || mode == TFmode
4416                             || (BYTES_BIG_ENDIAN
4417                                 && aarch64_vect_struct_mode_p (mode)));
4418
4419   bool allow_reg_index_p =
4420     !load_store_pair_p
4421     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4422     && !aarch64_vect_struct_mode_p (mode);
4423
4424   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4425      REG addressing.  */
4426   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4427       && (code != POST_INC && code != REG))
4428     return false;
4429
4430   switch (code)
4431     {
4432     case REG:
4433     case SUBREG:
4434       info->type = ADDRESS_REG_IMM;
4435       info->base = x;
4436       info->offset = const0_rtx;
4437       return aarch64_base_register_rtx_p (x, strict_p);
4438
4439     case PLUS:
4440       op0 = XEXP (x, 0);
4441       op1 = XEXP (x, 1);
4442
4443       if (! strict_p
4444           && REG_P (op0)
4445           && virt_or_elim_regno_p (REGNO (op0))
4446           && CONST_INT_P (op1))
4447         {
4448           info->type = ADDRESS_REG_IMM;
4449           info->base = op0;
4450           info->offset = op1;
4451
4452           return true;
4453         }
4454
4455       if (GET_MODE_SIZE (mode) != 0
4456           && CONST_INT_P (op1)
4457           && aarch64_base_register_rtx_p (op0, strict_p))
4458         {
4459           HOST_WIDE_INT offset = INTVAL (op1);
4460
4461           info->type = ADDRESS_REG_IMM;
4462           info->base = op0;
4463           info->offset = op1;
4464
4465           /* TImode and TFmode values are allowed in both pairs of X
4466              registers and individual Q registers.  The available
4467              address modes are:
4468              X,X: 7-bit signed scaled offset
4469              Q:   9-bit signed offset
4470              We conservatively require an offset representable in either mode.
4471              When performing the check for pairs of X registers i.e.  LDP/STP
4472              pass down DImode since that is the natural size of the LDP/STP
4473              instruction memory accesses.  */
4474           if (mode == TImode || mode == TFmode)
4475             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4476                     && (offset_9bit_signed_unscaled_p (mode, offset)
4477                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4478
4479           /* A 7bit offset check because OImode will emit a ldp/stp
4480              instruction (only big endian will get here).
4481              For ldp/stp instructions, the offset is scaled for the size of a
4482              single element of the pair.  */
4483           if (mode == OImode)
4484             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4485
4486           /* Three 9/12 bit offsets checks because CImode will emit three
4487              ldr/str instructions (only big endian will get here).  */
4488           if (mode == CImode)
4489             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4490                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4491                         || offset_12bit_unsigned_scaled_p (V16QImode,
4492                                                            offset + 32)));
4493
4494           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4495              instructions (only big endian will get here).  */
4496           if (mode == XImode)
4497             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4498                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4499                                                             offset + 32));
4500
4501           if (load_store_pair_p)
4502             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4503                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4504           else
4505             return (offset_9bit_signed_unscaled_p (mode, offset)
4506                     || offset_12bit_unsigned_scaled_p (mode, offset));
4507         }
4508
4509       if (allow_reg_index_p)
4510         {
4511           /* Look for base + (scaled/extended) index register.  */
4512           if (aarch64_base_register_rtx_p (op0, strict_p)
4513               && aarch64_classify_index (info, op1, mode, strict_p))
4514             {
4515               info->base = op0;
4516               return true;
4517             }
4518           if (aarch64_base_register_rtx_p (op1, strict_p)
4519               && aarch64_classify_index (info, op0, mode, strict_p))
4520             {
4521               info->base = op1;
4522               return true;
4523             }
4524         }
4525
4526       return false;
4527
4528     case POST_INC:
4529     case POST_DEC:
4530     case PRE_INC:
4531     case PRE_DEC:
4532       info->type = ADDRESS_REG_WB;
4533       info->base = XEXP (x, 0);
4534       info->offset = NULL_RTX;
4535       return aarch64_base_register_rtx_p (info->base, strict_p);
4536
4537     case POST_MODIFY:
4538     case PRE_MODIFY:
4539       info->type = ADDRESS_REG_WB;
4540       info->base = XEXP (x, 0);
4541       if (GET_CODE (XEXP (x, 1)) == PLUS
4542           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4543           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4544           && aarch64_base_register_rtx_p (info->base, strict_p))
4545         {
4546           HOST_WIDE_INT offset;
4547           info->offset = XEXP (XEXP (x, 1), 1);
4548           offset = INTVAL (info->offset);
4549
4550           /* TImode and TFmode values are allowed in both pairs of X
4551              registers and individual Q registers.  The available
4552              address modes are:
4553              X,X: 7-bit signed scaled offset
4554              Q:   9-bit signed offset
4555              We conservatively require an offset representable in either mode.
4556            */
4557           if (mode == TImode || mode == TFmode)
4558             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4559                     && offset_9bit_signed_unscaled_p (mode, offset));
4560
4561           if (load_store_pair_p)
4562             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4563                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4564           else
4565             return offset_9bit_signed_unscaled_p (mode, offset);
4566         }
4567       return false;
4568
4569     case CONST:
4570     case SYMBOL_REF:
4571     case LABEL_REF:
4572       /* load literal: pc-relative constant pool entry.  Only supported
4573          for SI mode or larger.  */
4574       info->type = ADDRESS_SYMBOLIC;
4575
4576       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4577         {
4578           rtx sym, addend;
4579
4580           split_const (x, &sym, &addend);
4581           return ((GET_CODE (sym) == LABEL_REF
4582                    || (GET_CODE (sym) == SYMBOL_REF
4583                        && CONSTANT_POOL_ADDRESS_P (sym)
4584                        && aarch64_pcrelative_literal_loads)));
4585         }
4586       return false;
4587
4588     case LO_SUM:
4589       info->type = ADDRESS_LO_SUM;
4590       info->base = XEXP (x, 0);
4591       info->offset = XEXP (x, 1);
4592       if (allow_reg_index_p
4593           && aarch64_base_register_rtx_p (info->base, strict_p))
4594         {
4595           rtx sym, offs;
4596           split_const (info->offset, &sym, &offs);
4597           if (GET_CODE (sym) == SYMBOL_REF
4598               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4599             {
4600               /* The symbol and offset must be aligned to the access size.  */
4601               unsigned int align;
4602               unsigned int ref_size;
4603
4604               if (CONSTANT_POOL_ADDRESS_P (sym))
4605                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4606               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4607                 {
4608                   tree exp = SYMBOL_REF_DECL (sym);
4609                   align = TYPE_ALIGN (TREE_TYPE (exp));
4610                   align = CONSTANT_ALIGNMENT (exp, align);
4611                 }
4612               else if (SYMBOL_REF_DECL (sym))
4613                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4614               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4615                        && SYMBOL_REF_BLOCK (sym) != NULL)
4616                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4617               else
4618                 align = BITS_PER_UNIT;
4619
4620               ref_size = GET_MODE_SIZE (mode);
4621               if (ref_size == 0)
4622                 ref_size = GET_MODE_SIZE (DImode);
4623
4624               return ((INTVAL (offs) & (ref_size - 1)) == 0
4625                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4626             }
4627         }
4628       return false;
4629
4630     default:
4631       return false;
4632     }
4633 }
4634
4635 /* Return true if the address X is valid for a PRFM instruction.
4636    STRICT_P is true if we should do strict checking with
4637    aarch64_classify_address.  */
4638
4639 bool
4640 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4641 {
4642   struct aarch64_address_info addr;
4643
4644   /* PRFM accepts the same addresses as DImode...  */
4645   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4646   if (!res)
4647     return false;
4648
4649   /* ... except writeback forms.  */
4650   return addr.type != ADDRESS_REG_WB;
4651 }
4652
4653 bool
4654 aarch64_symbolic_address_p (rtx x)
4655 {
4656   rtx offset;
4657
4658   split_const (x, &x, &offset);
4659   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4660 }
4661
4662 /* Classify the base of symbolic expression X.  */
4663
4664 enum aarch64_symbol_type
4665 aarch64_classify_symbolic_expression (rtx x)
4666 {
4667   rtx offset;
4668
4669   split_const (x, &x, &offset);
4670   return aarch64_classify_symbol (x, offset);
4671 }
4672
4673
4674 /* Return TRUE if X is a legitimate address for accessing memory in
4675    mode MODE.  */
4676 static bool
4677 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4678 {
4679   struct aarch64_address_info addr;
4680
4681   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4682 }
4683
4684 /* Return TRUE if X is a legitimate address for accessing memory in
4685    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4686    pair operation.  */
4687 bool
4688 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4689                               RTX_CODE outer_code, bool strict_p)
4690 {
4691   struct aarch64_address_info addr;
4692
4693   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4694 }
4695
4696 /* Split an out-of-range address displacement into a base and offset.
4697    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4698    to increase opportunities for sharing the base address of different sizes.
4699    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4700 static bool
4701 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4702 {
4703   HOST_WIDE_INT offset = INTVAL (*disp);
4704   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4705
4706   if (mode == TImode || mode == TFmode
4707       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4708     base = (offset + 0x100) & ~0x1ff;
4709
4710   *off = GEN_INT (base);
4711   *disp = GEN_INT (offset - base);
4712   return true;
4713 }
4714
4715 /* Return the binary representation of floating point constant VALUE in INTVAL.
4716    If the value cannot be converted, return false without setting INTVAL.
4717    The conversion is done in the given MODE.  */
4718 bool
4719 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4720 {
4721
4722   /* We make a general exception for 0.  */
4723   if (aarch64_float_const_zero_rtx_p (value))
4724     {
4725       *intval = 0;
4726       return true;
4727     }
4728
4729   machine_mode mode = GET_MODE (value);
4730   if (GET_CODE (value) != CONST_DOUBLE
4731       || !SCALAR_FLOAT_MODE_P (mode)
4732       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4733       /* Only support up to DF mode.  */
4734       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4735     return false;
4736
4737   unsigned HOST_WIDE_INT ival = 0;
4738
4739   long res[2];
4740   real_to_target (res,
4741                   CONST_DOUBLE_REAL_VALUE (value),
4742                   REAL_MODE_FORMAT (mode));
4743
4744   if (mode == DFmode)
4745     {
4746       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4747       ival = zext_hwi (res[order], 32);
4748       ival |= (zext_hwi (res[1 - order], 32) << 32);
4749     }
4750   else
4751       ival = zext_hwi (res[0], 32);
4752
4753   *intval = ival;
4754   return true;
4755 }
4756
4757 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4758    single MOV(+MOVK) followed by an FMOV.  */
4759 bool
4760 aarch64_float_const_rtx_p (rtx x)
4761 {
4762   machine_mode mode = GET_MODE (x);
4763   if (mode == VOIDmode)
4764     return false;
4765
4766   /* Determine whether it's cheaper to write float constants as
4767      mov/movk pairs over ldr/adrp pairs.  */
4768   unsigned HOST_WIDE_INT ival;
4769
4770   if (GET_CODE (x) == CONST_DOUBLE
4771       && SCALAR_FLOAT_MODE_P (mode)
4772       && aarch64_reinterpret_float_as_int (x, &ival))
4773     {
4774       machine_mode imode = (mode == HFmode
4775                             ? SImode
4776                             : int_mode_for_mode (mode).require ());
4777       int num_instr = aarch64_internal_mov_immediate
4778                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4779       return num_instr < 3;
4780     }
4781
4782   return false;
4783 }
4784
4785 /* Return TRUE if rtx X is immediate constant 0.0 */
4786 bool
4787 aarch64_float_const_zero_rtx_p (rtx x)
4788 {
4789   if (GET_MODE (x) == VOIDmode)
4790     return false;
4791
4792   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4793     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4794   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4795 }
4796
4797 /* Return TRUE if rtx X is immediate constant that fits in a single
4798    MOVI immediate operation.  */
4799 bool
4800 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4801 {
4802   if (!TARGET_SIMD)
4803      return false;
4804
4805   machine_mode vmode, imode;
4806   unsigned HOST_WIDE_INT ival;
4807
4808   if (GET_CODE (x) == CONST_DOUBLE
4809       && SCALAR_FLOAT_MODE_P (mode))
4810     {
4811       if (!aarch64_reinterpret_float_as_int (x, &ival))
4812         return false;
4813
4814       /* We make a general exception for 0.  */
4815       if (aarch64_float_const_zero_rtx_p (x))
4816         return true;
4817
4818       imode = int_mode_for_mode (mode).require ();
4819     }
4820   else if (GET_CODE (x) == CONST_INT
4821            && SCALAR_INT_MODE_P (mode))
4822     {
4823        imode = mode;
4824        ival = INTVAL (x);
4825     }
4826   else
4827     return false;
4828
4829    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4830      a 128 bit vector mode.  */
4831   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4832
4833   vmode = aarch64_simd_container_mode (imode, width);
4834   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4835
4836   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4837 }
4838
4839
4840 /* Return the fixed registers used for condition codes.  */
4841
4842 static bool
4843 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4844 {
4845   *p1 = CC_REGNUM;
4846   *p2 = INVALID_REGNUM;
4847   return true;
4848 }
4849
4850 /* This function is used by the call expanders of the machine description.
4851    RESULT is the register in which the result is returned.  It's NULL for
4852    "call" and "sibcall".
4853    MEM is the location of the function call.
4854    SIBCALL indicates whether this function call is normal call or sibling call.
4855    It will generate different pattern accordingly.  */
4856
4857 void
4858 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4859 {
4860   rtx call, callee, tmp;
4861   rtvec vec;
4862   machine_mode mode;
4863
4864   gcc_assert (MEM_P (mem));
4865   callee = XEXP (mem, 0);
4866   mode = GET_MODE (callee);
4867   gcc_assert (mode == Pmode);
4868
4869   /* Decide if we should generate indirect calls by loading the
4870      address of the callee into a register before performing
4871      the branch-and-link.  */
4872   if (SYMBOL_REF_P (callee)
4873       ? (aarch64_is_long_call_p (callee)
4874          || aarch64_is_noplt_call_p (callee))
4875       : !REG_P (callee))
4876     XEXP (mem, 0) = force_reg (mode, callee);
4877
4878   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4879
4880   if (result != NULL_RTX)
4881     call = gen_rtx_SET (result, call);
4882
4883   if (sibcall)
4884     tmp = ret_rtx;
4885   else
4886     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4887
4888   vec = gen_rtvec (2, call, tmp);
4889   call = gen_rtx_PARALLEL (VOIDmode, vec);
4890
4891   aarch64_emit_call_insn (call);
4892 }
4893
4894 /* Emit call insn with PAT and do aarch64-specific handling.  */
4895
4896 void
4897 aarch64_emit_call_insn (rtx pat)
4898 {
4899   rtx insn = emit_call_insn (pat);
4900
4901   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4902   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4903   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4904 }
4905
4906 machine_mode
4907 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4908 {
4909   /* All floating point compares return CCFP if it is an equality
4910      comparison, and CCFPE otherwise.  */
4911   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4912     {
4913       switch (code)
4914         {
4915         case EQ:
4916         case NE:
4917         case UNORDERED:
4918         case ORDERED:
4919         case UNLT:
4920         case UNLE:
4921         case UNGT:
4922         case UNGE:
4923         case UNEQ:
4924         case LTGT:
4925           return CCFPmode;
4926
4927         case LT:
4928         case LE:
4929         case GT:
4930         case GE:
4931           return CCFPEmode;
4932
4933         default:
4934           gcc_unreachable ();
4935         }
4936     }
4937
4938   /* Equality comparisons of short modes against zero can be performed
4939      using the TST instruction with the appropriate bitmask.  */
4940   if (y == const0_rtx && REG_P (x)
4941       && (code == EQ || code == NE)
4942       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4943     return CC_NZmode;
4944
4945   /* Similarly, comparisons of zero_extends from shorter modes can
4946      be performed using an ANDS with an immediate mask.  */
4947   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4948       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4949       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4950       && (code == EQ || code == NE))
4951     return CC_NZmode;
4952
4953   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4954       && y == const0_rtx
4955       && (code == EQ || code == NE || code == LT || code == GE)
4956       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4957           || GET_CODE (x) == NEG
4958           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4959               && CONST_INT_P (XEXP (x, 2)))))
4960     return CC_NZmode;
4961
4962   /* A compare with a shifted operand.  Because of canonicalization,
4963      the comparison will have to be swapped when we emit the assembly
4964      code.  */
4965   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4966       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4967       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4968           || GET_CODE (x) == LSHIFTRT
4969           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4970     return CC_SWPmode;
4971
4972   /* Similarly for a negated operand, but we can only do this for
4973      equalities.  */
4974   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4975       && (REG_P (y) || GET_CODE (y) == SUBREG)
4976       && (code == EQ || code == NE)
4977       && GET_CODE (x) == NEG)
4978     return CC_Zmode;
4979
4980   /* A test for unsigned overflow.  */
4981   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4982       && code == NE
4983       && GET_CODE (x) == PLUS
4984       && GET_CODE (y) == ZERO_EXTEND)
4985     return CC_Cmode;
4986
4987   /* For everything else, return CCmode.  */
4988   return CCmode;
4989 }
4990
4991 static int
4992 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4993
4994 int
4995 aarch64_get_condition_code (rtx x)
4996 {
4997   machine_mode mode = GET_MODE (XEXP (x, 0));
4998   enum rtx_code comp_code = GET_CODE (x);
4999
5000   if (GET_MODE_CLASS (mode) != MODE_CC)
5001     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5002   return aarch64_get_condition_code_1 (mode, comp_code);
5003 }
5004
5005 static int
5006 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5007 {
5008   switch (mode)
5009     {
5010     case E_CCFPmode:
5011     case E_CCFPEmode:
5012       switch (comp_code)
5013         {
5014         case GE: return AARCH64_GE;
5015         case GT: return AARCH64_GT;
5016         case LE: return AARCH64_LS;
5017         case LT: return AARCH64_MI;
5018         case NE: return AARCH64_NE;
5019         case EQ: return AARCH64_EQ;
5020         case ORDERED: return AARCH64_VC;
5021         case UNORDERED: return AARCH64_VS;
5022         case UNLT: return AARCH64_LT;
5023         case UNLE: return AARCH64_LE;
5024         case UNGT: return AARCH64_HI;
5025         case UNGE: return AARCH64_PL;
5026         default: return -1;
5027         }
5028       break;
5029
5030     case E_CCmode:
5031       switch (comp_code)
5032         {
5033         case NE: return AARCH64_NE;
5034         case EQ: return AARCH64_EQ;
5035         case GE: return AARCH64_GE;
5036         case GT: return AARCH64_GT;
5037         case LE: return AARCH64_LE;
5038         case LT: return AARCH64_LT;
5039         case GEU: return AARCH64_CS;
5040         case GTU: return AARCH64_HI;
5041         case LEU: return AARCH64_LS;
5042         case LTU: return AARCH64_CC;
5043         default: return -1;
5044         }
5045       break;
5046
5047     case E_CC_SWPmode:
5048       switch (comp_code)
5049         {
5050         case NE: return AARCH64_NE;
5051         case EQ: return AARCH64_EQ;
5052         case GE: return AARCH64_LE;
5053         case GT: return AARCH64_LT;
5054         case LE: return AARCH64_GE;
5055         case LT: return AARCH64_GT;
5056         case GEU: return AARCH64_LS;
5057         case GTU: return AARCH64_CC;
5058         case LEU: return AARCH64_CS;
5059         case LTU: return AARCH64_HI;
5060         default: return -1;
5061         }
5062       break;
5063
5064     case E_CC_NZmode:
5065       switch (comp_code)
5066         {
5067         case NE: return AARCH64_NE;
5068         case EQ: return AARCH64_EQ;
5069         case GE: return AARCH64_PL;
5070         case LT: return AARCH64_MI;
5071         default: return -1;
5072         }
5073       break;
5074
5075     case E_CC_Zmode:
5076       switch (comp_code)
5077         {
5078         case NE: return AARCH64_NE;
5079         case EQ: return AARCH64_EQ;
5080         default: return -1;
5081         }
5082       break;
5083
5084     case E_CC_Cmode:
5085       switch (comp_code)
5086         {
5087         case NE: return AARCH64_CS;
5088         case EQ: return AARCH64_CC;
5089         default: return -1;
5090         }
5091       break;
5092
5093     default:
5094       return -1;
5095     }
5096
5097   return -1;
5098 }
5099
5100 bool
5101 aarch64_const_vec_all_same_in_range_p (rtx x,
5102                                   HOST_WIDE_INT minval,
5103                                   HOST_WIDE_INT maxval)
5104 {
5105   HOST_WIDE_INT firstval;
5106   int count, i;
5107
5108   if (GET_CODE (x) != CONST_VECTOR
5109       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5110     return false;
5111
5112   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5113   if (firstval < minval || firstval > maxval)
5114     return false;
5115
5116   count = CONST_VECTOR_NUNITS (x);
5117   for (i = 1; i < count; i++)
5118     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5119       return false;
5120
5121   return true;
5122 }
5123
5124 bool
5125 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5126 {
5127   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5128 }
5129
5130
5131 /* N Z C V.  */
5132 #define AARCH64_CC_V 1
5133 #define AARCH64_CC_C (1 << 1)
5134 #define AARCH64_CC_Z (1 << 2)
5135 #define AARCH64_CC_N (1 << 3)
5136
5137 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5138 static const int aarch64_nzcv_codes[] =
5139 {
5140   0,            /* EQ, Z == 1.  */
5141   AARCH64_CC_Z, /* NE, Z == 0.  */
5142   0,            /* CS, C == 1.  */
5143   AARCH64_CC_C, /* CC, C == 0.  */
5144   0,            /* MI, N == 1.  */
5145   AARCH64_CC_N, /* PL, N == 0.  */
5146   0,            /* VS, V == 1.  */
5147   AARCH64_CC_V, /* VC, V == 0.  */
5148   0,            /* HI, C ==1 && Z == 0.  */
5149   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5150   AARCH64_CC_V, /* GE, N == V.  */
5151   0,            /* LT, N != V.  */
5152   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5153   0,            /* LE, !(Z == 0 && N == V).  */
5154   0,            /* AL, Any.  */
5155   0             /* NV, Any.  */
5156 };
5157
5158 /* Print operand X to file F in a target specific manner according to CODE.
5159    The acceptable formatting commands given by CODE are:
5160      'c':               An integer or symbol address without a preceding #
5161                         sign.
5162      'e':               Print the sign/zero-extend size as a character 8->b,
5163                         16->h, 32->w.
5164      'p':               Prints N such that 2^N == X (X must be power of 2 and
5165                         const int).
5166      'P':               Print the number of non-zero bits in X (a const_int).
5167      'H':               Print the higher numbered register of a pair (TImode)
5168                         of regs.
5169      'm':               Print a condition (eq, ne, etc).
5170      'M':               Same as 'm', but invert condition.
5171      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5172      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5173                         The register printed is the FP/SIMD register name
5174                         of X + 0/1/2/3 for S/T/U/V.
5175      'R':               Print a scalar FP/SIMD register name + 1.
5176      'X':               Print bottom 16 bits of integer constant in hex.
5177      'w/x':             Print a general register name or the zero register
5178                         (32-bit or 64-bit).
5179      '0':               Print a normal operand, if it's a general register,
5180                         then we assume DImode.
5181      'k':               Print NZCV for conditional compare instructions.
5182      'A':               Output address constant representing the first
5183                         argument of X, specifying a relocation offset
5184                         if appropriate.
5185      'L':               Output constant address specified by X
5186                         with a relocation offset if appropriate.
5187      'G':               Prints address of X, specifying a PC relative
5188                         relocation mode if appropriate.  */
5189
5190 static void
5191 aarch64_print_operand (FILE *f, rtx x, int code)
5192 {
5193   switch (code)
5194     {
5195     case 'c':
5196       switch (GET_CODE (x))
5197         {
5198         case CONST_INT:
5199           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5200           break;
5201
5202         case SYMBOL_REF:
5203           output_addr_const (f, x);
5204           break;
5205
5206         case CONST:
5207           if (GET_CODE (XEXP (x, 0)) == PLUS
5208               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5209             {
5210               output_addr_const (f, x);
5211               break;
5212             }
5213           /* Fall through.  */
5214
5215         default:
5216           output_operand_lossage ("Unsupported operand for code '%c'", code);
5217         }
5218       break;
5219
5220     case 'e':
5221       {
5222         int n;
5223
5224         if (!CONST_INT_P (x)
5225             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5226           {
5227             output_operand_lossage ("invalid operand for '%%%c'", code);
5228             return;
5229           }
5230
5231         switch (n)
5232           {
5233           case 3:
5234             fputc ('b', f);
5235             break;
5236           case 4:
5237             fputc ('h', f);
5238             break;
5239           case 5:
5240             fputc ('w', f);
5241             break;
5242           default:
5243             output_operand_lossage ("invalid operand for '%%%c'", code);
5244             return;
5245           }
5246       }
5247       break;
5248
5249     case 'p':
5250       {
5251         int n;
5252
5253         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5254           {
5255             output_operand_lossage ("invalid operand for '%%%c'", code);
5256             return;
5257           }
5258
5259         asm_fprintf (f, "%d", n);
5260       }
5261       break;
5262
5263     case 'P':
5264       if (!CONST_INT_P (x))
5265         {
5266           output_operand_lossage ("invalid operand for '%%%c'", code);
5267           return;
5268         }
5269
5270       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5271       break;
5272
5273     case 'H':
5274       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5275         {
5276           output_operand_lossage ("invalid operand for '%%%c'", code);
5277           return;
5278         }
5279
5280       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5281       break;
5282
5283     case 'M':
5284     case 'm':
5285       {
5286         int cond_code;
5287         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5288         if (x == const_true_rtx)
5289           {
5290             if (code == 'M')
5291               fputs ("nv", f);
5292             return;
5293           }
5294
5295         if (!COMPARISON_P (x))
5296           {
5297             output_operand_lossage ("invalid operand for '%%%c'", code);
5298             return;
5299           }
5300
5301         cond_code = aarch64_get_condition_code (x);
5302         gcc_assert (cond_code >= 0);
5303         if (code == 'M')
5304           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5305         fputs (aarch64_condition_codes[cond_code], f);
5306       }
5307       break;
5308
5309     case 'b':
5310     case 'h':
5311     case 's':
5312     case 'd':
5313     case 'q':
5314       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5315         {
5316           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5317           return;
5318         }
5319       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5320       break;
5321
5322     case 'S':
5323     case 'T':
5324     case 'U':
5325     case 'V':
5326       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5327         {
5328           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5329           return;
5330         }
5331       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5332       break;
5333
5334     case 'R':
5335       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5336         {
5337           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5338           return;
5339         }
5340       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5341       break;
5342
5343     case 'X':
5344       if (!CONST_INT_P (x))
5345         {
5346           output_operand_lossage ("invalid operand for '%%%c'", code);
5347           return;
5348         }
5349       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5350       break;
5351
5352     case 'w':
5353     case 'x':
5354       if (x == const0_rtx
5355           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5356         {
5357           asm_fprintf (f, "%czr", code);
5358           break;
5359         }
5360
5361       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5362         {
5363           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5364           break;
5365         }
5366
5367       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5368         {
5369           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5370           break;
5371         }
5372
5373       /* Fall through */
5374
5375     case 0:
5376       if (x == NULL)
5377         {
5378           output_operand_lossage ("missing operand");
5379           return;
5380         }
5381
5382       switch (GET_CODE (x))
5383         {
5384         case REG:
5385           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5386           break;
5387
5388         case MEM:
5389           output_address (GET_MODE (x), XEXP (x, 0));
5390           /* Check all memory references are Pmode - even with ILP32.  */
5391           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5392           break;
5393
5394         case CONST:
5395         case LABEL_REF:
5396         case SYMBOL_REF:
5397           output_addr_const (asm_out_file, x);
5398           break;
5399
5400         case CONST_INT:
5401           asm_fprintf (f, "%wd", INTVAL (x));
5402           break;
5403
5404         case CONST_VECTOR:
5405           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5406             {
5407               gcc_assert (
5408                   aarch64_const_vec_all_same_in_range_p (x,
5409                                                          HOST_WIDE_INT_MIN,
5410                                                          HOST_WIDE_INT_MAX));
5411               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5412             }
5413           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5414             {
5415               fputc ('0', f);
5416             }
5417           else
5418             gcc_unreachable ();
5419           break;
5420
5421         case CONST_DOUBLE:
5422           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5423              be getting CONST_DOUBLEs holding integers.  */
5424           gcc_assert (GET_MODE (x) != VOIDmode);
5425           if (aarch64_float_const_zero_rtx_p (x))
5426             {
5427               fputc ('0', f);
5428               break;
5429             }
5430           else if (aarch64_float_const_representable_p (x))
5431             {
5432 #define buf_size 20
5433               char float_buf[buf_size] = {'\0'};
5434               real_to_decimal_for_mode (float_buf,
5435                                         CONST_DOUBLE_REAL_VALUE (x),
5436                                         buf_size, buf_size,
5437                                         1, GET_MODE (x));
5438               asm_fprintf (asm_out_file, "%s", float_buf);
5439               break;
5440 #undef buf_size
5441             }
5442           output_operand_lossage ("invalid constant");
5443           return;
5444         default:
5445           output_operand_lossage ("invalid operand");
5446           return;
5447         }
5448       break;
5449
5450     case 'A':
5451       if (GET_CODE (x) == HIGH)
5452         x = XEXP (x, 0);
5453
5454       switch (aarch64_classify_symbolic_expression (x))
5455         {
5456         case SYMBOL_SMALL_GOT_4G:
5457           asm_fprintf (asm_out_file, ":got:");
5458           break;
5459
5460         case SYMBOL_SMALL_TLSGD:
5461           asm_fprintf (asm_out_file, ":tlsgd:");
5462           break;
5463
5464         case SYMBOL_SMALL_TLSDESC:
5465           asm_fprintf (asm_out_file, ":tlsdesc:");
5466           break;
5467
5468         case SYMBOL_SMALL_TLSIE:
5469           asm_fprintf (asm_out_file, ":gottprel:");
5470           break;
5471
5472         case SYMBOL_TLSLE24:
5473           asm_fprintf (asm_out_file, ":tprel:");
5474           break;
5475
5476         case SYMBOL_TINY_GOT:
5477           gcc_unreachable ();
5478           break;
5479
5480         default:
5481           break;
5482         }
5483       output_addr_const (asm_out_file, x);
5484       break;
5485
5486     case 'L':
5487       switch (aarch64_classify_symbolic_expression (x))
5488         {
5489         case SYMBOL_SMALL_GOT_4G:
5490           asm_fprintf (asm_out_file, ":lo12:");
5491           break;
5492
5493         case SYMBOL_SMALL_TLSGD:
5494           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5495           break;
5496
5497         case SYMBOL_SMALL_TLSDESC:
5498           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5499           break;
5500
5501         case SYMBOL_SMALL_TLSIE:
5502           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5503           break;
5504
5505         case SYMBOL_TLSLE12:
5506           asm_fprintf (asm_out_file, ":tprel_lo12:");
5507           break;
5508
5509         case SYMBOL_TLSLE24:
5510           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5511           break;
5512
5513         case SYMBOL_TINY_GOT:
5514           asm_fprintf (asm_out_file, ":got:");
5515           break;
5516
5517         case SYMBOL_TINY_TLSIE:
5518           asm_fprintf (asm_out_file, ":gottprel:");
5519           break;
5520
5521         default:
5522           break;
5523         }
5524       output_addr_const (asm_out_file, x);
5525       break;
5526
5527     case 'G':
5528       switch (aarch64_classify_symbolic_expression (x))
5529         {
5530         case SYMBOL_TLSLE24:
5531           asm_fprintf (asm_out_file, ":tprel_hi12:");
5532           break;
5533         default:
5534           break;
5535         }
5536       output_addr_const (asm_out_file, x);
5537       break;
5538
5539     case 'k':
5540       {
5541         HOST_WIDE_INT cond_code;
5542
5543         if (!CONST_INT_P (x))
5544           {
5545             output_operand_lossage ("invalid operand for '%%%c'", code);
5546             return;
5547           }
5548
5549         cond_code = INTVAL (x);
5550         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5551         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5552       }
5553       break;
5554
5555     default:
5556       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5557       return;
5558     }
5559 }
5560
5561 static void
5562 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5563 {
5564   struct aarch64_address_info addr;
5565
5566   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5567     switch (addr.type)
5568       {
5569       case ADDRESS_REG_IMM:
5570         if (addr.offset == const0_rtx)
5571           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5572         else
5573           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5574                        INTVAL (addr.offset));
5575         return;
5576
5577       case ADDRESS_REG_REG:
5578         if (addr.shift == 0)
5579           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5580                        reg_names [REGNO (addr.offset)]);
5581         else
5582           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5583                        reg_names [REGNO (addr.offset)], addr.shift);
5584         return;
5585
5586       case ADDRESS_REG_UXTW:
5587         if (addr.shift == 0)
5588           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5589                        REGNO (addr.offset) - R0_REGNUM);
5590         else
5591           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5592                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5593         return;
5594
5595       case ADDRESS_REG_SXTW:
5596         if (addr.shift == 0)
5597           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5598                        REGNO (addr.offset) - R0_REGNUM);
5599         else
5600           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5601                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5602         return;
5603
5604       case ADDRESS_REG_WB:
5605         switch (GET_CODE (x))
5606           {
5607           case PRE_INC:
5608             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5609                          GET_MODE_SIZE (mode));
5610             return;
5611           case POST_INC:
5612             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5613                          GET_MODE_SIZE (mode));
5614             return;
5615           case PRE_DEC:
5616             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5617                          GET_MODE_SIZE (mode));
5618             return;
5619           case POST_DEC:
5620             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5621                          GET_MODE_SIZE (mode));
5622             return;
5623           case PRE_MODIFY:
5624             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5625                          INTVAL (addr.offset));
5626             return;
5627           case POST_MODIFY:
5628             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5629                          INTVAL (addr.offset));
5630             return;
5631           default:
5632             break;
5633           }
5634         break;
5635
5636       case ADDRESS_LO_SUM:
5637         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5638         output_addr_const (f, addr.offset);
5639         asm_fprintf (f, "]");
5640         return;
5641
5642       case ADDRESS_SYMBOLIC:
5643         break;
5644       }
5645
5646   output_addr_const (f, x);
5647 }
5648
5649 bool
5650 aarch64_label_mentioned_p (rtx x)
5651 {
5652   const char *fmt;
5653   int i;
5654
5655   if (GET_CODE (x) == LABEL_REF)
5656     return true;
5657
5658   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5659      referencing instruction, but they are constant offsets, not
5660      symbols.  */
5661   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5662     return false;
5663
5664   fmt = GET_RTX_FORMAT (GET_CODE (x));
5665   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5666     {
5667       if (fmt[i] == 'E')
5668         {
5669           int j;
5670
5671           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5672             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5673               return 1;
5674         }
5675       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5676         return 1;
5677     }
5678
5679   return 0;
5680 }
5681
5682 /* Implement REGNO_REG_CLASS.  */
5683
5684 enum reg_class
5685 aarch64_regno_regclass (unsigned regno)
5686 {
5687   if (GP_REGNUM_P (regno))
5688     return GENERAL_REGS;
5689
5690   if (regno == SP_REGNUM)
5691     return STACK_REG;
5692
5693   if (regno == FRAME_POINTER_REGNUM
5694       || regno == ARG_POINTER_REGNUM)
5695     return POINTER_REGS;
5696
5697   if (FP_REGNUM_P (regno))
5698     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5699
5700   return NO_REGS;
5701 }
5702
5703 static rtx
5704 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5705 {
5706   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5707      where mask is selected by alignment and size of the offset.
5708      We try to pick as large a range for the offset as possible to
5709      maximize the chance of a CSE.  However, for aligned addresses
5710      we limit the range to 4k so that structures with different sized
5711      elements are likely to use the same base.  We need to be careful
5712      not to split a CONST for some forms of address expression, otherwise
5713      it will generate sub-optimal code.  */
5714
5715   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5716     {
5717       rtx base = XEXP (x, 0);
5718       rtx offset_rtx = XEXP (x, 1);
5719       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5720
5721       if (GET_CODE (base) == PLUS)
5722         {
5723           rtx op0 = XEXP (base, 0);
5724           rtx op1 = XEXP (base, 1);
5725
5726           /* Force any scaling into a temp for CSE.  */
5727           op0 = force_reg (Pmode, op0);
5728           op1 = force_reg (Pmode, op1);
5729
5730           /* Let the pointer register be in op0.  */
5731           if (REG_POINTER (op1))
5732             std::swap (op0, op1);
5733
5734           /* If the pointer is virtual or frame related, then we know that
5735              virtual register instantiation or register elimination is going
5736              to apply a second constant.  We want the two constants folded
5737              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5738           if (virt_or_elim_regno_p (REGNO (op0)))
5739             {
5740               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5741                                    NULL_RTX, true, OPTAB_DIRECT);
5742               return gen_rtx_PLUS (Pmode, base, op1);
5743             }
5744
5745           /* Otherwise, in order to encourage CSE (and thence loop strength
5746              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5747           base = expand_binop (Pmode, add_optab, op0, op1,
5748                                NULL_RTX, true, OPTAB_DIRECT);
5749           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5750         }
5751
5752       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5753       HOST_WIDE_INT base_offset;
5754       if (GET_MODE_SIZE (mode) > 16)
5755         base_offset = (offset + 0x400) & ~0x7f0;
5756       /* For offsets aren't a multiple of the access size, the limit is
5757          -256...255.  */
5758       else if (offset & (GET_MODE_SIZE (mode) - 1))
5759         {
5760           base_offset = (offset + 0x100) & ~0x1ff;
5761
5762           /* BLKmode typically uses LDP of X-registers.  */
5763           if (mode == BLKmode)
5764             base_offset = (offset + 512) & ~0x3ff;
5765         }
5766       /* Small negative offsets are supported.  */
5767       else if (IN_RANGE (offset, -256, 0))
5768         base_offset = 0;
5769       else if (mode == TImode || mode == TFmode)
5770         base_offset = (offset + 0x100) & ~0x1ff;
5771       /* Use 12-bit offset by access size.  */
5772       else
5773         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5774
5775       if (base_offset != 0)
5776         {
5777           base = plus_constant (Pmode, base, base_offset);
5778           base = force_operand (base, NULL_RTX);
5779           return plus_constant (Pmode, base, offset - base_offset);
5780         }
5781     }
5782
5783   return x;
5784 }
5785
5786 /* Return the reload icode required for a constant pool in mode.  */
5787 static enum insn_code
5788 aarch64_constant_pool_reload_icode (machine_mode mode)
5789 {
5790   switch (mode)
5791     {
5792     case E_SFmode:
5793       return CODE_FOR_aarch64_reload_movcpsfdi;
5794
5795     case E_DFmode:
5796       return CODE_FOR_aarch64_reload_movcpdfdi;
5797
5798     case E_TFmode:
5799       return CODE_FOR_aarch64_reload_movcptfdi;
5800
5801     case E_V8QImode:
5802       return CODE_FOR_aarch64_reload_movcpv8qidi;
5803
5804     case E_V16QImode:
5805       return CODE_FOR_aarch64_reload_movcpv16qidi;
5806
5807     case E_V4HImode:
5808       return CODE_FOR_aarch64_reload_movcpv4hidi;
5809
5810     case E_V8HImode:
5811       return CODE_FOR_aarch64_reload_movcpv8hidi;
5812
5813     case E_V2SImode:
5814       return CODE_FOR_aarch64_reload_movcpv2sidi;
5815
5816     case E_V4SImode:
5817       return CODE_FOR_aarch64_reload_movcpv4sidi;
5818
5819     case E_V2DImode:
5820       return CODE_FOR_aarch64_reload_movcpv2didi;
5821
5822     case E_V2DFmode:
5823       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5824
5825     default:
5826       gcc_unreachable ();
5827     }
5828
5829   gcc_unreachable ();
5830 }
5831 static reg_class_t
5832 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5833                           reg_class_t rclass,
5834                           machine_mode mode,
5835                           secondary_reload_info *sri)
5836 {
5837
5838   /* If we have to disable direct literal pool loads and stores because the
5839      function is too big, then we need a scratch register.  */
5840   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5841       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5842           || targetm.vector_mode_supported_p (GET_MODE (x)))
5843       && !aarch64_pcrelative_literal_loads)
5844     {
5845       sri->icode = aarch64_constant_pool_reload_icode (mode);
5846       return NO_REGS;
5847     }
5848
5849   /* Without the TARGET_SIMD instructions we cannot move a Q register
5850      to a Q register directly.  We need a scratch.  */
5851   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5852       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5853       && reg_class_subset_p (rclass, FP_REGS))
5854     {
5855       if (mode == TFmode)
5856         sri->icode = CODE_FOR_aarch64_reload_movtf;
5857       else if (mode == TImode)
5858         sri->icode = CODE_FOR_aarch64_reload_movti;
5859       return NO_REGS;
5860     }
5861
5862   /* A TFmode or TImode memory access should be handled via an FP_REGS
5863      because AArch64 has richer addressing modes for LDR/STR instructions
5864      than LDP/STP instructions.  */
5865   if (TARGET_FLOAT && rclass == GENERAL_REGS
5866       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5867     return FP_REGS;
5868
5869   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5870       return GENERAL_REGS;
5871
5872   return NO_REGS;
5873 }
5874
5875 static bool
5876 aarch64_can_eliminate (const int from, const int to)
5877 {
5878   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5879      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5880
5881   if (frame_pointer_needed)
5882     {
5883       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5884         return true;
5885       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5886         return false;
5887       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5888           && !cfun->calls_alloca)
5889         return true;
5890       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5891         return true;
5892
5893       return false;
5894     }
5895   else
5896     {
5897       /* If we decided that we didn't need a leaf frame pointer but then used
5898          LR in the function, then we'll want a frame pointer after all, so
5899          prevent this elimination to ensure a frame pointer is used.  */
5900       if (to == STACK_POINTER_REGNUM
5901           && flag_omit_leaf_frame_pointer
5902           && df_regs_ever_live_p (LR_REGNUM))
5903         return false;
5904     }
5905
5906   return true;
5907 }
5908
5909 HOST_WIDE_INT
5910 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5911 {
5912   aarch64_layout_frame ();
5913
5914   if (to == HARD_FRAME_POINTER_REGNUM)
5915     {
5916       if (from == ARG_POINTER_REGNUM)
5917         return cfun->machine->frame.hard_fp_offset;
5918
5919       if (from == FRAME_POINTER_REGNUM)
5920         return cfun->machine->frame.hard_fp_offset
5921                - cfun->machine->frame.locals_offset;
5922     }
5923
5924   if (to == STACK_POINTER_REGNUM)
5925     {
5926       if (from == FRAME_POINTER_REGNUM)
5927           return cfun->machine->frame.frame_size
5928                  - cfun->machine->frame.locals_offset;
5929     }
5930
5931   return cfun->machine->frame.frame_size;
5932 }
5933
5934 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5935    previous frame.  */
5936
5937 rtx
5938 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5939 {
5940   if (count != 0)
5941     return const0_rtx;
5942   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5943 }
5944
5945
5946 static void
5947 aarch64_asm_trampoline_template (FILE *f)
5948 {
5949   if (TARGET_ILP32)
5950     {
5951       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5952       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5953     }
5954   else
5955     {
5956       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5957       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5958     }
5959   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5960   assemble_aligned_integer (4, const0_rtx);
5961   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5962   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5963 }
5964
5965 static void
5966 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5967 {
5968   rtx fnaddr, mem, a_tramp;
5969   const int tramp_code_sz = 16;
5970
5971   /* Don't need to copy the trailing D-words, we fill those in below.  */
5972   emit_block_move (m_tramp, assemble_trampoline_template (),
5973                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5974   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5975   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5976   if (GET_MODE (fnaddr) != ptr_mode)
5977     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5978   emit_move_insn (mem, fnaddr);
5979
5980   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5981   emit_move_insn (mem, chain_value);
5982
5983   /* XXX We should really define a "clear_cache" pattern and use
5984      gen_clear_cache().  */
5985   a_tramp = XEXP (m_tramp, 0);
5986   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5987                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
5988                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5989                      ptr_mode);
5990 }
5991
5992 static unsigned char
5993 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5994 {
5995   switch (regclass)
5996     {
5997     case CALLER_SAVE_REGS:
5998     case POINTER_REGS:
5999     case GENERAL_REGS:
6000     case ALL_REGS:
6001     case FP_REGS:
6002     case FP_LO_REGS:
6003       return
6004         aarch64_vector_mode_p (mode)
6005           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6006           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6007     case STACK_REG:
6008       return 1;
6009
6010     case NO_REGS:
6011       return 0;
6012
6013     default:
6014       break;
6015     }
6016   gcc_unreachable ();
6017 }
6018
6019 static reg_class_t
6020 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6021 {
6022   if (regclass == POINTER_REGS)
6023     return GENERAL_REGS;
6024
6025   if (regclass == STACK_REG)
6026     {
6027       if (REG_P(x)
6028           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6029           return regclass;
6030
6031       return NO_REGS;
6032     }
6033
6034   /* Register eliminiation can result in a request for
6035      SP+constant->FP_REGS.  We cannot support such operations which
6036      use SP as source and an FP_REG as destination, so reject out
6037      right now.  */
6038   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6039     {
6040       rtx lhs = XEXP (x, 0);
6041
6042       /* Look through a possible SUBREG introduced by ILP32.  */
6043       if (GET_CODE (lhs) == SUBREG)
6044         lhs = SUBREG_REG (lhs);
6045
6046       gcc_assert (REG_P (lhs));
6047       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6048                                       POINTER_REGS));
6049       return NO_REGS;
6050     }
6051
6052   return regclass;
6053 }
6054
6055 void
6056 aarch64_asm_output_labelref (FILE* f, const char *name)
6057 {
6058   asm_fprintf (f, "%U%s", name);
6059 }
6060
6061 static void
6062 aarch64_elf_asm_constructor (rtx symbol, int priority)
6063 {
6064   if (priority == DEFAULT_INIT_PRIORITY)
6065     default_ctor_section_asm_out_constructor (symbol, priority);
6066   else
6067     {
6068       section *s;
6069       /* While priority is known to be in range [0, 65535], so 18 bytes
6070          would be enough, the compiler might not know that.  To avoid
6071          -Wformat-truncation false positive, use a larger size.  */
6072       char buf[23];
6073       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6074       s = get_section (buf, SECTION_WRITE, NULL);
6075       switch_to_section (s);
6076       assemble_align (POINTER_SIZE);
6077       assemble_aligned_integer (POINTER_BYTES, symbol);
6078     }
6079 }
6080
6081 static void
6082 aarch64_elf_asm_destructor (rtx symbol, int priority)
6083 {
6084   if (priority == DEFAULT_INIT_PRIORITY)
6085     default_dtor_section_asm_out_destructor (symbol, priority);
6086   else
6087     {
6088       section *s;
6089       /* While priority is known to be in range [0, 65535], so 18 bytes
6090          would be enough, the compiler might not know that.  To avoid
6091          -Wformat-truncation false positive, use a larger size.  */
6092       char buf[23];
6093       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6094       s = get_section (buf, SECTION_WRITE, NULL);
6095       switch_to_section (s);
6096       assemble_align (POINTER_SIZE);
6097       assemble_aligned_integer (POINTER_BYTES, symbol);
6098     }
6099 }
6100
6101 const char*
6102 aarch64_output_casesi (rtx *operands)
6103 {
6104   char buf[100];
6105   char label[100];
6106   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6107   int index;
6108   static const char *const patterns[4][2] =
6109   {
6110     {
6111       "ldrb\t%w3, [%0,%w1,uxtw]",
6112       "add\t%3, %4, %w3, sxtb #2"
6113     },
6114     {
6115       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6116       "add\t%3, %4, %w3, sxth #2"
6117     },
6118     {
6119       "ldr\t%w3, [%0,%w1,uxtw #2]",
6120       "add\t%3, %4, %w3, sxtw #2"
6121     },
6122     /* We assume that DImode is only generated when not optimizing and
6123        that we don't really need 64-bit address offsets.  That would
6124        imply an object file with 8GB of code in a single function!  */
6125     {
6126       "ldr\t%w3, [%0,%w1,uxtw #2]",
6127       "add\t%3, %4, %w3, sxtw #2"
6128     }
6129   };
6130
6131   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6132
6133   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6134
6135   gcc_assert (index >= 0 && index <= 3);
6136
6137   /* Need to implement table size reduction, by chaning the code below.  */
6138   output_asm_insn (patterns[index][0], operands);
6139   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6140   snprintf (buf, sizeof (buf),
6141             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6142   output_asm_insn (buf, operands);
6143   output_asm_insn (patterns[index][1], operands);
6144   output_asm_insn ("br\t%3", operands);
6145   assemble_label (asm_out_file, label);
6146   return "";
6147 }
6148
6149
6150 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6151    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6152    operator.  */
6153
6154 int
6155 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6156 {
6157   if (shift >= 0 && shift <= 3)
6158     {
6159       int size;
6160       for (size = 8; size <= 32; size *= 2)
6161         {
6162           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6163           if (mask == bits << shift)
6164             return size;
6165         }
6166     }
6167   return 0;
6168 }
6169
6170 /* Constant pools are per function only when PC relative
6171    literal loads are true or we are in the large memory
6172    model.  */
6173
6174 static inline bool
6175 aarch64_can_use_per_function_literal_pools_p (void)
6176 {
6177   return (aarch64_pcrelative_literal_loads
6178           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6179 }
6180
6181 static bool
6182 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6183 {
6184   /* Fixme:: In an ideal world this would work similar
6185      to the logic in aarch64_select_rtx_section but this
6186      breaks bootstrap in gcc go.  For now we workaround
6187      this by returning false here.  */
6188   return false;
6189 }
6190
6191 /* Select appropriate section for constants depending
6192    on where we place literal pools.  */
6193
6194 static section *
6195 aarch64_select_rtx_section (machine_mode mode,
6196                             rtx x,
6197                             unsigned HOST_WIDE_INT align)
6198 {
6199   if (aarch64_can_use_per_function_literal_pools_p ())
6200     return function_section (current_function_decl);
6201
6202   return default_elf_select_rtx_section (mode, x, align);
6203 }
6204
6205 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6206 void
6207 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6208                                   HOST_WIDE_INT offset)
6209 {
6210   /* When using per-function literal pools, we must ensure that any code
6211      section is aligned to the minimal instruction length, lest we get
6212      errors from the assembler re "unaligned instructions".  */
6213   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6214     ASM_OUTPUT_ALIGN (f, 2);
6215 }
6216
6217 /* Costs.  */
6218
6219 /* Helper function for rtx cost calculation.  Strip a shift expression
6220    from X.  Returns the inner operand if successful, or the original
6221    expression on failure.  */
6222 static rtx
6223 aarch64_strip_shift (rtx x)
6224 {
6225   rtx op = x;
6226
6227   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6228      we can convert both to ROR during final output.  */
6229   if ((GET_CODE (op) == ASHIFT
6230        || GET_CODE (op) == ASHIFTRT
6231        || GET_CODE (op) == LSHIFTRT
6232        || GET_CODE (op) == ROTATERT
6233        || GET_CODE (op) == ROTATE)
6234       && CONST_INT_P (XEXP (op, 1)))
6235     return XEXP (op, 0);
6236
6237   if (GET_CODE (op) == MULT
6238       && CONST_INT_P (XEXP (op, 1))
6239       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6240     return XEXP (op, 0);
6241
6242   return x;
6243 }
6244
6245 /* Helper function for rtx cost calculation.  Strip an extend
6246    expression from X.  Returns the inner operand if successful, or the
6247    original expression on failure.  We deal with a number of possible
6248    canonicalization variations here. If STRIP_SHIFT is true, then
6249    we can strip off a shift also.  */
6250 static rtx
6251 aarch64_strip_extend (rtx x, bool strip_shift)
6252 {
6253   rtx op = x;
6254
6255   /* Zero and sign extraction of a widened value.  */
6256   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6257       && XEXP (op, 2) == const0_rtx
6258       && GET_CODE (XEXP (op, 0)) == MULT
6259       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6260                                          XEXP (op, 1)))
6261     return XEXP (XEXP (op, 0), 0);
6262
6263   /* It can also be represented (for zero-extend) as an AND with an
6264      immediate.  */
6265   if (GET_CODE (op) == AND
6266       && GET_CODE (XEXP (op, 0)) == MULT
6267       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6268       && CONST_INT_P (XEXP (op, 1))
6269       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6270                            INTVAL (XEXP (op, 1))) != 0)
6271     return XEXP (XEXP (op, 0), 0);
6272
6273   /* Now handle extended register, as this may also have an optional
6274      left shift by 1..4.  */
6275   if (strip_shift
6276       && GET_CODE (op) == ASHIFT
6277       && CONST_INT_P (XEXP (op, 1))
6278       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6279     op = XEXP (op, 0);
6280
6281   if (GET_CODE (op) == ZERO_EXTEND
6282       || GET_CODE (op) == SIGN_EXTEND)
6283     op = XEXP (op, 0);
6284
6285   if (op != x)
6286     return op;
6287
6288   return x;
6289 }
6290
6291 /* Return true iff CODE is a shift supported in combination
6292    with arithmetic instructions.  */
6293
6294 static bool
6295 aarch64_shift_p (enum rtx_code code)
6296 {
6297   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6298 }
6299
6300
6301 /* Return true iff X is a cheap shift without a sign extend. */
6302
6303 static bool
6304 aarch64_cheap_mult_shift_p (rtx x)
6305 {
6306   rtx op0, op1;
6307
6308   op0 = XEXP (x, 0);
6309   op1 = XEXP (x, 1);
6310
6311   if (!(aarch64_tune_params.extra_tuning_flags
6312                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6313     return false;
6314
6315   if (GET_CODE (op0) == SIGN_EXTEND)
6316     return false;
6317
6318   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6319       && UINTVAL (op1) <= 4)
6320     return true;
6321
6322   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6323     return false;
6324
6325   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6326
6327   if (l2 > 0 && l2 <= 4)
6328     return true;
6329
6330   return false;
6331 }
6332
6333 /* Helper function for rtx cost calculation.  Calculate the cost of
6334    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6335    Return the calculated cost of the expression, recursing manually in to
6336    operands where needed.  */
6337
6338 static int
6339 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6340 {
6341   rtx op0, op1;
6342   const struct cpu_cost_table *extra_cost
6343     = aarch64_tune_params.insn_extra_cost;
6344   int cost = 0;
6345   bool compound_p = (outer == PLUS || outer == MINUS);
6346   machine_mode mode = GET_MODE (x);
6347
6348   gcc_checking_assert (code == MULT);
6349
6350   op0 = XEXP (x, 0);
6351   op1 = XEXP (x, 1);
6352
6353   if (VECTOR_MODE_P (mode))
6354     mode = GET_MODE_INNER (mode);
6355
6356   /* Integer multiply/fma.  */
6357   if (GET_MODE_CLASS (mode) == MODE_INT)
6358     {
6359       /* The multiply will be canonicalized as a shift, cost it as such.  */
6360       if (aarch64_shift_p (GET_CODE (x))
6361           || (CONST_INT_P (op1)
6362               && exact_log2 (INTVAL (op1)) > 0))
6363         {
6364           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6365                            || GET_CODE (op0) == SIGN_EXTEND;
6366           if (speed)
6367             {
6368               if (compound_p)
6369                 {
6370                   /* If the shift is considered cheap,
6371                      then don't add any cost. */
6372                   if (aarch64_cheap_mult_shift_p (x))
6373                     ;
6374                   else if (REG_P (op1))
6375                     /* ARITH + shift-by-register.  */
6376                     cost += extra_cost->alu.arith_shift_reg;
6377                   else if (is_extend)
6378                     /* ARITH + extended register.  We don't have a cost field
6379                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6380                     cost += extra_cost->alu.extend_arith;
6381                   else
6382                     /* ARITH + shift-by-immediate.  */
6383                     cost += extra_cost->alu.arith_shift;
6384                 }
6385               else
6386                 /* LSL (immediate).  */
6387                 cost += extra_cost->alu.shift;
6388
6389             }
6390           /* Strip extends as we will have costed them in the case above.  */
6391           if (is_extend)
6392             op0 = aarch64_strip_extend (op0, true);
6393
6394           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6395
6396           return cost;
6397         }
6398
6399       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6400          compound and let the below cases handle it.  After all, MNEG is a
6401          special-case alias of MSUB.  */
6402       if (GET_CODE (op0) == NEG)
6403         {
6404           op0 = XEXP (op0, 0);
6405           compound_p = true;
6406         }
6407
6408       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6409       if ((GET_CODE (op0) == ZERO_EXTEND
6410            && GET_CODE (op1) == ZERO_EXTEND)
6411           || (GET_CODE (op0) == SIGN_EXTEND
6412               && GET_CODE (op1) == SIGN_EXTEND))
6413         {
6414           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6415           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6416
6417           if (speed)
6418             {
6419               if (compound_p)
6420                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6421                 cost += extra_cost->mult[0].extend_add;
6422               else
6423                 /* MUL/SMULL/UMULL.  */
6424                 cost += extra_cost->mult[0].extend;
6425             }
6426
6427           return cost;
6428         }
6429
6430       /* This is either an integer multiply or a MADD.  In both cases
6431          we want to recurse and cost the operands.  */
6432       cost += rtx_cost (op0, mode, MULT, 0, speed);
6433       cost += rtx_cost (op1, mode, MULT, 1, speed);
6434
6435       if (speed)
6436         {
6437           if (compound_p)
6438             /* MADD/MSUB.  */
6439             cost += extra_cost->mult[mode == DImode].add;
6440           else
6441             /* MUL.  */
6442             cost += extra_cost->mult[mode == DImode].simple;
6443         }
6444
6445       return cost;
6446     }
6447   else
6448     {
6449       if (speed)
6450         {
6451           /* Floating-point FMA/FMUL can also support negations of the
6452              operands, unless the rounding mode is upward or downward in
6453              which case FNMUL is different than FMUL with operand negation.  */
6454           bool neg0 = GET_CODE (op0) == NEG;
6455           bool neg1 = GET_CODE (op1) == NEG;
6456           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6457             {
6458               if (neg0)
6459                 op0 = XEXP (op0, 0);
6460               if (neg1)
6461                 op1 = XEXP (op1, 0);
6462             }
6463
6464           if (compound_p)
6465             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6466             cost += extra_cost->fp[mode == DFmode].fma;
6467           else
6468             /* FMUL/FNMUL.  */
6469             cost += extra_cost->fp[mode == DFmode].mult;
6470         }
6471
6472       cost += rtx_cost (op0, mode, MULT, 0, speed);
6473       cost += rtx_cost (op1, mode, MULT, 1, speed);
6474       return cost;
6475     }
6476 }
6477
6478 static int
6479 aarch64_address_cost (rtx x,
6480                       machine_mode mode,
6481                       addr_space_t as ATTRIBUTE_UNUSED,
6482                       bool speed)
6483 {
6484   enum rtx_code c = GET_CODE (x);
6485   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6486   struct aarch64_address_info info;
6487   int cost = 0;
6488   info.shift = 0;
6489
6490   if (!aarch64_classify_address (&info, x, mode, c, false))
6491     {
6492       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6493         {
6494           /* This is a CONST or SYMBOL ref which will be split
6495              in a different way depending on the code model in use.
6496              Cost it through the generic infrastructure.  */
6497           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6498           /* Divide through by the cost of one instruction to
6499              bring it to the same units as the address costs.  */
6500           cost_symbol_ref /= COSTS_N_INSNS (1);
6501           /* The cost is then the cost of preparing the address,
6502              followed by an immediate (possibly 0) offset.  */
6503           return cost_symbol_ref + addr_cost->imm_offset;
6504         }
6505       else
6506         {
6507           /* This is most likely a jump table from a case
6508              statement.  */
6509           return addr_cost->register_offset;
6510         }
6511     }
6512
6513   switch (info.type)
6514     {
6515       case ADDRESS_LO_SUM:
6516       case ADDRESS_SYMBOLIC:
6517       case ADDRESS_REG_IMM:
6518         cost += addr_cost->imm_offset;
6519         break;
6520
6521       case ADDRESS_REG_WB:
6522         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6523           cost += addr_cost->pre_modify;
6524         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6525           cost += addr_cost->post_modify;
6526         else
6527           gcc_unreachable ();
6528
6529         break;
6530
6531       case ADDRESS_REG_REG:
6532         cost += addr_cost->register_offset;
6533         break;
6534
6535       case ADDRESS_REG_SXTW:
6536         cost += addr_cost->register_sextend;
6537         break;
6538
6539       case ADDRESS_REG_UXTW:
6540         cost += addr_cost->register_zextend;
6541         break;
6542
6543       default:
6544         gcc_unreachable ();
6545     }
6546
6547
6548   if (info.shift > 0)
6549     {
6550       /* For the sake of calculating the cost of the shifted register
6551          component, we can treat same sized modes in the same way.  */
6552       switch (GET_MODE_BITSIZE (mode))
6553         {
6554           case 16:
6555             cost += addr_cost->addr_scale_costs.hi;
6556             break;
6557
6558           case 32:
6559             cost += addr_cost->addr_scale_costs.si;
6560             break;
6561
6562           case 64:
6563             cost += addr_cost->addr_scale_costs.di;
6564             break;
6565
6566           /* We can't tell, or this is a 128-bit vector.  */
6567           default:
6568             cost += addr_cost->addr_scale_costs.ti;
6569             break;
6570         }
6571     }
6572
6573   return cost;
6574 }
6575
6576 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6577    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6578    to be taken.  */
6579
6580 int
6581 aarch64_branch_cost (bool speed_p, bool predictable_p)
6582 {
6583   /* When optimizing for speed, use the cost of unpredictable branches.  */
6584   const struct cpu_branch_cost *branch_costs =
6585     aarch64_tune_params.branch_costs;
6586
6587   if (!speed_p || predictable_p)
6588     return branch_costs->predictable;
6589   else
6590     return branch_costs->unpredictable;
6591 }
6592
6593 /* Return true if the RTX X in mode MODE is a zero or sign extract
6594    usable in an ADD or SUB (extended register) instruction.  */
6595 static bool
6596 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6597 {
6598   /* Catch add with a sign extract.
6599      This is add_<optab><mode>_multp2.  */
6600   if (GET_CODE (x) == SIGN_EXTRACT
6601       || GET_CODE (x) == ZERO_EXTRACT)
6602     {
6603       rtx op0 = XEXP (x, 0);
6604       rtx op1 = XEXP (x, 1);
6605       rtx op2 = XEXP (x, 2);
6606
6607       if (GET_CODE (op0) == MULT
6608           && CONST_INT_P (op1)
6609           && op2 == const0_rtx
6610           && CONST_INT_P (XEXP (op0, 1))
6611           && aarch64_is_extend_from_extract (mode,
6612                                              XEXP (op0, 1),
6613                                              op1))
6614         {
6615           return true;
6616         }
6617     }
6618   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6619      No shift.  */
6620   else if (GET_CODE (x) == SIGN_EXTEND
6621            || GET_CODE (x) == ZERO_EXTEND)
6622     return REG_P (XEXP (x, 0));
6623
6624   return false;
6625 }
6626
6627 static bool
6628 aarch64_frint_unspec_p (unsigned int u)
6629 {
6630   switch (u)
6631     {
6632       case UNSPEC_FRINTZ:
6633       case UNSPEC_FRINTP:
6634       case UNSPEC_FRINTM:
6635       case UNSPEC_FRINTA:
6636       case UNSPEC_FRINTN:
6637       case UNSPEC_FRINTX:
6638       case UNSPEC_FRINTI:
6639         return true;
6640
6641       default:
6642         return false;
6643     }
6644 }
6645
6646 /* Return true iff X is an rtx that will match an extr instruction
6647    i.e. as described in the *extr<mode>5_insn family of patterns.
6648    OP0 and OP1 will be set to the operands of the shifts involved
6649    on success and will be NULL_RTX otherwise.  */
6650
6651 static bool
6652 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6653 {
6654   rtx op0, op1;
6655   machine_mode mode = GET_MODE (x);
6656
6657   *res_op0 = NULL_RTX;
6658   *res_op1 = NULL_RTX;
6659
6660   if (GET_CODE (x) != IOR)
6661     return false;
6662
6663   op0 = XEXP (x, 0);
6664   op1 = XEXP (x, 1);
6665
6666   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6667       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6668     {
6669      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6670       if (GET_CODE (op1) == ASHIFT)
6671         std::swap (op0, op1);
6672
6673       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6674         return false;
6675
6676       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6677       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6678
6679       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6680           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6681         {
6682           *res_op0 = XEXP (op0, 0);
6683           *res_op1 = XEXP (op1, 0);
6684           return true;
6685         }
6686     }
6687
6688   return false;
6689 }
6690
6691 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6692    storing it in *COST.  Result is true if the total cost of the operation
6693    has now been calculated.  */
6694 static bool
6695 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6696 {
6697   rtx inner;
6698   rtx comparator;
6699   enum rtx_code cmpcode;
6700
6701   if (COMPARISON_P (op0))
6702     {
6703       inner = XEXP (op0, 0);
6704       comparator = XEXP (op0, 1);
6705       cmpcode = GET_CODE (op0);
6706     }
6707   else
6708     {
6709       inner = op0;
6710       comparator = const0_rtx;
6711       cmpcode = NE;
6712     }
6713
6714   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6715     {
6716       /* Conditional branch.  */
6717       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6718         return true;
6719       else
6720         {
6721           if (cmpcode == NE || cmpcode == EQ)
6722             {
6723               if (comparator == const0_rtx)
6724                 {
6725                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6726                   if (GET_CODE (inner) == ZERO_EXTRACT)
6727                     /* TBZ/TBNZ.  */
6728                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6729                                        ZERO_EXTRACT, 0, speed);
6730                   else
6731                     /* CBZ/CBNZ.  */
6732                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6733
6734                 return true;
6735               }
6736             }
6737           else if (cmpcode == LT || cmpcode == GE)
6738             {
6739               /* TBZ/TBNZ.  */
6740               if (comparator == const0_rtx)
6741                 return true;
6742             }
6743         }
6744     }
6745   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6746     {
6747       /* CCMP.  */
6748       if (GET_CODE (op1) == COMPARE)
6749         {
6750           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6751           if (XEXP (op1, 1) == const0_rtx)
6752             *cost += 1;
6753           if (speed)
6754             {
6755               machine_mode mode = GET_MODE (XEXP (op1, 0));
6756               const struct cpu_cost_table *extra_cost
6757                 = aarch64_tune_params.insn_extra_cost;
6758
6759               if (GET_MODE_CLASS (mode) == MODE_INT)
6760                 *cost += extra_cost->alu.arith;
6761               else
6762                 *cost += extra_cost->fp[mode == DFmode].compare;
6763             }
6764           return true;
6765         }
6766
6767       /* It's a conditional operation based on the status flags,
6768          so it must be some flavor of CSEL.  */
6769
6770       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6771       if (GET_CODE (op1) == NEG
6772           || GET_CODE (op1) == NOT
6773           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6774         op1 = XEXP (op1, 0);
6775       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6776         {
6777           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6778           op1 = XEXP (op1, 0);
6779           op2 = XEXP (op2, 0);
6780         }
6781
6782       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6783       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6784       return true;
6785     }
6786
6787   /* We don't know what this is, cost all operands.  */
6788   return false;
6789 }
6790
6791 /* Check whether X is a bitfield operation of the form shift + extend that
6792    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6793    operand to which the bitfield operation is applied.  Otherwise return
6794    NULL_RTX.  */
6795
6796 static rtx
6797 aarch64_extend_bitfield_pattern_p (rtx x)
6798 {
6799   rtx_code outer_code = GET_CODE (x);
6800   machine_mode outer_mode = GET_MODE (x);
6801
6802   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6803       && outer_mode != SImode && outer_mode != DImode)
6804     return NULL_RTX;
6805
6806   rtx inner = XEXP (x, 0);
6807   rtx_code inner_code = GET_CODE (inner);
6808   machine_mode inner_mode = GET_MODE (inner);
6809   rtx op = NULL_RTX;
6810
6811   switch (inner_code)
6812     {
6813       case ASHIFT:
6814         if (CONST_INT_P (XEXP (inner, 1))
6815             && (inner_mode == QImode || inner_mode == HImode))
6816           op = XEXP (inner, 0);
6817         break;
6818       case LSHIFTRT:
6819         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6820             && (inner_mode == QImode || inner_mode == HImode))
6821           op = XEXP (inner, 0);
6822         break;
6823       case ASHIFTRT:
6824         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6825             && (inner_mode == QImode || inner_mode == HImode))
6826           op = XEXP (inner, 0);
6827         break;
6828       default:
6829         break;
6830     }
6831
6832   return op;
6833 }
6834
6835 /* Return true if the mask and a shift amount from an RTX of the form
6836    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6837    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6838
6839 bool
6840 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6841 {
6842   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6843          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6844          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6845          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6846 }
6847
6848 /* Calculate the cost of calculating X, storing it in *COST.  Result
6849    is true if the total cost of the operation has now been calculated.  */
6850 static bool
6851 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6852                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6853 {
6854   rtx op0, op1, op2;
6855   const struct cpu_cost_table *extra_cost
6856     = aarch64_tune_params.insn_extra_cost;
6857   int code = GET_CODE (x);
6858   scalar_int_mode int_mode;
6859
6860   /* By default, assume that everything has equivalent cost to the
6861      cheapest instruction.  Any additional costs are applied as a delta
6862      above this default.  */
6863   *cost = COSTS_N_INSNS (1);
6864
6865   switch (code)
6866     {
6867     case SET:
6868       /* The cost depends entirely on the operands to SET.  */
6869       *cost = 0;
6870       op0 = SET_DEST (x);
6871       op1 = SET_SRC (x);
6872
6873       switch (GET_CODE (op0))
6874         {
6875         case MEM:
6876           if (speed)
6877             {
6878               rtx address = XEXP (op0, 0);
6879               if (VECTOR_MODE_P (mode))
6880                 *cost += extra_cost->ldst.storev;
6881               else if (GET_MODE_CLASS (mode) == MODE_INT)
6882                 *cost += extra_cost->ldst.store;
6883               else if (mode == SFmode)
6884                 *cost += extra_cost->ldst.storef;
6885               else if (mode == DFmode)
6886                 *cost += extra_cost->ldst.stored;
6887
6888               *cost +=
6889                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6890                                                      0, speed));
6891             }
6892
6893           *cost += rtx_cost (op1, mode, SET, 1, speed);
6894           return true;
6895
6896         case SUBREG:
6897           if (! REG_P (SUBREG_REG (op0)))
6898             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6899
6900           /* Fall through.  */
6901         case REG:
6902           /* The cost is one per vector-register copied.  */
6903           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6904             {
6905               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6906                               / GET_MODE_SIZE (V4SImode);
6907               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6908             }
6909           /* const0_rtx is in general free, but we will use an
6910              instruction to set a register to 0.  */
6911           else if (REG_P (op1) || op1 == const0_rtx)
6912             {
6913               /* The cost is 1 per register copied.  */
6914               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6915                               / UNITS_PER_WORD;
6916               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6917             }
6918           else
6919             /* Cost is just the cost of the RHS of the set.  */
6920             *cost += rtx_cost (op1, mode, SET, 1, speed);
6921           return true;
6922
6923         case ZERO_EXTRACT:
6924         case SIGN_EXTRACT:
6925           /* Bit-field insertion.  Strip any redundant widening of
6926              the RHS to meet the width of the target.  */
6927           if (GET_CODE (op1) == SUBREG)
6928             op1 = SUBREG_REG (op1);
6929           if ((GET_CODE (op1) == ZERO_EXTEND
6930                || GET_CODE (op1) == SIGN_EXTEND)
6931               && CONST_INT_P (XEXP (op0, 1))
6932               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6933                   >= INTVAL (XEXP (op0, 1))))
6934             op1 = XEXP (op1, 0);
6935
6936           if (CONST_INT_P (op1))
6937             {
6938               /* MOV immediate is assumed to always be cheap.  */
6939               *cost = COSTS_N_INSNS (1);
6940             }
6941           else
6942             {
6943               /* BFM.  */
6944               if (speed)
6945                 *cost += extra_cost->alu.bfi;
6946               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6947             }
6948
6949           return true;
6950
6951         default:
6952           /* We can't make sense of this, assume default cost.  */
6953           *cost = COSTS_N_INSNS (1);
6954           return false;
6955         }
6956       return false;
6957
6958     case CONST_INT:
6959       /* If an instruction can incorporate a constant within the
6960          instruction, the instruction's expression avoids calling
6961          rtx_cost() on the constant.  If rtx_cost() is called on a
6962          constant, then it is usually because the constant must be
6963          moved into a register by one or more instructions.
6964
6965          The exception is constant 0, which can be expressed
6966          as XZR/WZR and is therefore free.  The exception to this is
6967          if we have (set (reg) (const0_rtx)) in which case we must cost
6968          the move.  However, we can catch that when we cost the SET, so
6969          we don't need to consider that here.  */
6970       if (x == const0_rtx)
6971         *cost = 0;
6972       else
6973         {
6974           /* To an approximation, building any other constant is
6975              proportionally expensive to the number of instructions
6976              required to build that constant.  This is true whether we
6977              are compiling for SPEED or otherwise.  */
6978           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6979                                  (NULL_RTX, x, false, mode));
6980         }
6981       return true;
6982
6983     case CONST_DOUBLE:
6984
6985       /* First determine number of instructions to do the move
6986           as an integer constant.  */
6987       if (!aarch64_float_const_representable_p (x)
6988            && !aarch64_can_const_movi_rtx_p (x, mode)
6989            && aarch64_float_const_rtx_p (x))
6990         {
6991           unsigned HOST_WIDE_INT ival;
6992           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6993           gcc_assert (succeed);
6994
6995           machine_mode imode = (mode == HFmode
6996                                 ? SImode
6997                                 : int_mode_for_mode (mode).require ());
6998           int ncost = aarch64_internal_mov_immediate
6999                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7000           *cost += COSTS_N_INSNS (ncost);
7001           return true;
7002         }
7003
7004       if (speed)
7005         {
7006           /* mov[df,sf]_aarch64.  */
7007           if (aarch64_float_const_representable_p (x))
7008             /* FMOV (scalar immediate).  */
7009             *cost += extra_cost->fp[mode == DFmode].fpconst;
7010           else if (!aarch64_float_const_zero_rtx_p (x))
7011             {
7012               /* This will be a load from memory.  */
7013               if (mode == DFmode)
7014                 *cost += extra_cost->ldst.loadd;
7015               else
7016                 *cost += extra_cost->ldst.loadf;
7017             }
7018           else
7019             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7020                or MOV v0.s[0], wzr - neither of which are modeled by the
7021                cost tables.  Just use the default cost.  */
7022             {
7023             }
7024         }
7025
7026       return true;
7027
7028     case MEM:
7029       if (speed)
7030         {
7031           /* For loads we want the base cost of a load, plus an
7032              approximation for the additional cost of the addressing
7033              mode.  */
7034           rtx address = XEXP (x, 0);
7035           if (VECTOR_MODE_P (mode))
7036             *cost += extra_cost->ldst.loadv;
7037           else if (GET_MODE_CLASS (mode) == MODE_INT)
7038             *cost += extra_cost->ldst.load;
7039           else if (mode == SFmode)
7040             *cost += extra_cost->ldst.loadf;
7041           else if (mode == DFmode)
7042             *cost += extra_cost->ldst.loadd;
7043
7044           *cost +=
7045                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7046                                                      0, speed));
7047         }
7048
7049       return true;
7050
7051     case NEG:
7052       op0 = XEXP (x, 0);
7053
7054       if (VECTOR_MODE_P (mode))
7055         {
7056           if (speed)
7057             {
7058               /* FNEG.  */
7059               *cost += extra_cost->vect.alu;
7060             }
7061           return false;
7062         }
7063
7064       if (GET_MODE_CLASS (mode) == MODE_INT)
7065         {
7066           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7067               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7068             {
7069               /* CSETM.  */
7070               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7071               return true;
7072             }
7073
7074           /* Cost this as SUB wzr, X.  */
7075           op0 = CONST0_RTX (mode);
7076           op1 = XEXP (x, 0);
7077           goto cost_minus;
7078         }
7079
7080       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7081         {
7082           /* Support (neg(fma...)) as a single instruction only if
7083              sign of zeros is unimportant.  This matches the decision
7084              making in aarch64.md.  */
7085           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7086             {
7087               /* FNMADD.  */
7088               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7089               return true;
7090             }
7091           if (GET_CODE (op0) == MULT)
7092             {
7093               /* FNMUL.  */
7094               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7095               return true;
7096             }
7097           if (speed)
7098             /* FNEG.  */
7099             *cost += extra_cost->fp[mode == DFmode].neg;
7100           return false;
7101         }
7102
7103       return false;
7104
7105     case CLRSB:
7106     case CLZ:
7107       if (speed)
7108         {
7109           if (VECTOR_MODE_P (mode))
7110             *cost += extra_cost->vect.alu;
7111           else
7112             *cost += extra_cost->alu.clz;
7113         }
7114
7115       return false;
7116
7117     case COMPARE:
7118       op0 = XEXP (x, 0);
7119       op1 = XEXP (x, 1);
7120
7121       if (op1 == const0_rtx
7122           && GET_CODE (op0) == AND)
7123         {
7124           x = op0;
7125           mode = GET_MODE (op0);
7126           goto cost_logic;
7127         }
7128
7129       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7130         {
7131           /* TODO: A write to the CC flags possibly costs extra, this
7132              needs encoding in the cost tables.  */
7133
7134           mode = GET_MODE (op0);
7135           /* ANDS.  */
7136           if (GET_CODE (op0) == AND)
7137             {
7138               x = op0;
7139               goto cost_logic;
7140             }
7141
7142           if (GET_CODE (op0) == PLUS)
7143             {
7144               /* ADDS (and CMN alias).  */
7145               x = op0;
7146               goto cost_plus;
7147             }
7148
7149           if (GET_CODE (op0) == MINUS)
7150             {
7151               /* SUBS.  */
7152               x = op0;
7153               goto cost_minus;
7154             }
7155
7156           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7157               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7158               && CONST_INT_P (XEXP (op0, 2)))
7159             {
7160               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7161                  Handle it here directly rather than going to cost_logic
7162                  since we know the immediate generated for the TST is valid
7163                  so we can avoid creating an intermediate rtx for it only
7164                  for costing purposes.  */
7165               if (speed)
7166                 *cost += extra_cost->alu.logical;
7167
7168               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7169                                  ZERO_EXTRACT, 0, speed);
7170               return true;
7171             }
7172
7173           if (GET_CODE (op1) == NEG)
7174             {
7175               /* CMN.  */
7176               if (speed)
7177                 *cost += extra_cost->alu.arith;
7178
7179               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7180               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7181               return true;
7182             }
7183
7184           /* CMP.
7185
7186              Compare can freely swap the order of operands, and
7187              canonicalization puts the more complex operation first.
7188              But the integer MINUS logic expects the shift/extend
7189              operation in op1.  */
7190           if (! (REG_P (op0)
7191                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7192           {
7193             op0 = XEXP (x, 1);
7194             op1 = XEXP (x, 0);
7195           }
7196           goto cost_minus;
7197         }
7198
7199       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7200         {
7201           /* FCMP.  */
7202           if (speed)
7203             *cost += extra_cost->fp[mode == DFmode].compare;
7204
7205           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7206             {
7207               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7208               /* FCMP supports constant 0.0 for no extra cost. */
7209               return true;
7210             }
7211           return false;
7212         }
7213
7214       if (VECTOR_MODE_P (mode))
7215         {
7216           /* Vector compare.  */
7217           if (speed)
7218             *cost += extra_cost->vect.alu;
7219
7220           if (aarch64_float_const_zero_rtx_p (op1))
7221             {
7222               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7223                  cost.  */
7224               return true;
7225             }
7226           return false;
7227         }
7228       return false;
7229
7230     case MINUS:
7231       {
7232         op0 = XEXP (x, 0);
7233         op1 = XEXP (x, 1);
7234
7235 cost_minus:
7236         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7237
7238         /* Detect valid immediates.  */
7239         if ((GET_MODE_CLASS (mode) == MODE_INT
7240              || (GET_MODE_CLASS (mode) == MODE_CC
7241                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7242             && CONST_INT_P (op1)
7243             && aarch64_uimm12_shift (INTVAL (op1)))
7244           {
7245             if (speed)
7246               /* SUB(S) (immediate).  */
7247               *cost += extra_cost->alu.arith;
7248             return true;
7249           }
7250
7251         /* Look for SUB (extended register).  */
7252         if (aarch64_rtx_arith_op_extract_p (op1, mode))
7253           {
7254             if (speed)
7255               *cost += extra_cost->alu.extend_arith;
7256
7257             op1 = aarch64_strip_extend (op1, true);
7258             *cost += rtx_cost (op1, VOIDmode,
7259                                (enum rtx_code) GET_CODE (op1), 0, speed);
7260             return true;
7261           }
7262
7263         rtx new_op1 = aarch64_strip_extend (op1, false);
7264
7265         /* Cost this as an FMA-alike operation.  */
7266         if ((GET_CODE (new_op1) == MULT
7267              || aarch64_shift_p (GET_CODE (new_op1)))
7268             && code != COMPARE)
7269           {
7270             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7271                                             (enum rtx_code) code,
7272                                             speed);
7273             return true;
7274           }
7275
7276         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7277
7278         if (speed)
7279           {
7280             if (VECTOR_MODE_P (mode))
7281               {
7282                 /* Vector SUB.  */
7283                 *cost += extra_cost->vect.alu;
7284               }
7285             else if (GET_MODE_CLASS (mode) == MODE_INT)
7286               {
7287                 /* SUB(S).  */
7288                 *cost += extra_cost->alu.arith;
7289               }
7290             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7291               {
7292                 /* FSUB.  */
7293                 *cost += extra_cost->fp[mode == DFmode].addsub;
7294               }
7295           }
7296         return true;
7297       }
7298
7299     case PLUS:
7300       {
7301         rtx new_op0;
7302
7303         op0 = XEXP (x, 0);
7304         op1 = XEXP (x, 1);
7305
7306 cost_plus:
7307         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7308             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7309           {
7310             /* CSINC.  */
7311             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7312             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7313             return true;
7314           }
7315
7316         if (GET_MODE_CLASS (mode) == MODE_INT
7317             && CONST_INT_P (op1)
7318             && aarch64_uimm12_shift (INTVAL (op1)))
7319           {
7320             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7321
7322             if (speed)
7323               /* ADD (immediate).  */
7324               *cost += extra_cost->alu.arith;
7325             return true;
7326           }
7327
7328         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7329
7330         /* Look for ADD (extended register).  */
7331         if (aarch64_rtx_arith_op_extract_p (op0, mode))
7332           {
7333             if (speed)
7334               *cost += extra_cost->alu.extend_arith;
7335
7336             op0 = aarch64_strip_extend (op0, true);
7337             *cost += rtx_cost (op0, VOIDmode,
7338                                (enum rtx_code) GET_CODE (op0), 0, speed);
7339             return true;
7340           }
7341
7342         /* Strip any extend, leave shifts behind as we will
7343            cost them through mult_cost.  */
7344         new_op0 = aarch64_strip_extend (op0, false);
7345
7346         if (GET_CODE (new_op0) == MULT
7347             || aarch64_shift_p (GET_CODE (new_op0)))
7348           {
7349             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7350                                             speed);
7351             return true;
7352           }
7353
7354         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7355
7356         if (speed)
7357           {
7358             if (VECTOR_MODE_P (mode))
7359               {
7360                 /* Vector ADD.  */
7361                 *cost += extra_cost->vect.alu;
7362               }
7363             else if (GET_MODE_CLASS (mode) == MODE_INT)
7364               {
7365                 /* ADD.  */
7366                 *cost += extra_cost->alu.arith;
7367               }
7368             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7369               {
7370                 /* FADD.  */
7371                 *cost += extra_cost->fp[mode == DFmode].addsub;
7372               }
7373           }
7374         return true;
7375       }
7376
7377     case BSWAP:
7378       *cost = COSTS_N_INSNS (1);
7379
7380       if (speed)
7381         {
7382           if (VECTOR_MODE_P (mode))
7383             *cost += extra_cost->vect.alu;
7384           else
7385             *cost += extra_cost->alu.rev;
7386         }
7387       return false;
7388
7389     case IOR:
7390       if (aarch_rev16_p (x))
7391         {
7392           *cost = COSTS_N_INSNS (1);
7393
7394           if (speed)
7395             {
7396               if (VECTOR_MODE_P (mode))
7397                 *cost += extra_cost->vect.alu;
7398               else
7399                 *cost += extra_cost->alu.rev;
7400             }
7401           return true;
7402         }
7403
7404       if (aarch64_extr_rtx_p (x, &op0, &op1))
7405         {
7406           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7407           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7408           if (speed)
7409             *cost += extra_cost->alu.shift;
7410
7411           return true;
7412         }
7413     /* Fall through.  */
7414     case XOR:
7415     case AND:
7416     cost_logic:
7417       op0 = XEXP (x, 0);
7418       op1 = XEXP (x, 1);
7419
7420       if (VECTOR_MODE_P (mode))
7421         {
7422           if (speed)
7423             *cost += extra_cost->vect.alu;
7424           return true;
7425         }
7426
7427       if (code == AND
7428           && GET_CODE (op0) == MULT
7429           && CONST_INT_P (XEXP (op0, 1))
7430           && CONST_INT_P (op1)
7431           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7432                                INTVAL (op1)) != 0)
7433         {
7434           /* This is a UBFM/SBFM.  */
7435           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7436           if (speed)
7437             *cost += extra_cost->alu.bfx;
7438           return true;
7439         }
7440
7441       if (is_int_mode (mode, &int_mode))
7442         {
7443           if (CONST_INT_P (op1))
7444             {
7445               /* We have a mask + shift version of a UBFIZ
7446                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7447               if (GET_CODE (op0) == ASHIFT
7448                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7449                                                          XEXP (op0, 1)))
7450                 {
7451                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
7452                                      (enum rtx_code) code, 0, speed);
7453                   if (speed)
7454                     *cost += extra_cost->alu.bfx;
7455
7456                   return true;
7457                 }
7458               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7459                 {
7460                 /* We possibly get the immediate for free, this is not
7461                    modelled.  */
7462                   *cost += rtx_cost (op0, int_mode,
7463                                      (enum rtx_code) code, 0, speed);
7464                   if (speed)
7465                     *cost += extra_cost->alu.logical;
7466
7467                   return true;
7468                 }
7469             }
7470           else
7471             {
7472               rtx new_op0 = op0;
7473
7474               /* Handle ORN, EON, or BIC.  */
7475               if (GET_CODE (op0) == NOT)
7476                 op0 = XEXP (op0, 0);
7477
7478               new_op0 = aarch64_strip_shift (op0);
7479
7480               /* If we had a shift on op0 then this is a logical-shift-
7481                  by-register/immediate operation.  Otherwise, this is just
7482                  a logical operation.  */
7483               if (speed)
7484                 {
7485                   if (new_op0 != op0)
7486                     {
7487                       /* Shift by immediate.  */
7488                       if (CONST_INT_P (XEXP (op0, 1)))
7489                         *cost += extra_cost->alu.log_shift;
7490                       else
7491                         *cost += extra_cost->alu.log_shift_reg;
7492                     }
7493                   else
7494                     *cost += extra_cost->alu.logical;
7495                 }
7496
7497               /* In both cases we want to cost both operands.  */
7498               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7499                                  0, speed);
7500               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7501                                  1, speed);
7502
7503               return true;
7504             }
7505         }
7506       return false;
7507
7508     case NOT:
7509       x = XEXP (x, 0);
7510       op0 = aarch64_strip_shift (x);
7511
7512       if (VECTOR_MODE_P (mode))
7513         {
7514           /* Vector NOT.  */
7515           *cost += extra_cost->vect.alu;
7516           return false;
7517         }
7518
7519       /* MVN-shifted-reg.  */
7520       if (op0 != x)
7521         {
7522           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7523
7524           if (speed)
7525             *cost += extra_cost->alu.log_shift;
7526
7527           return true;
7528         }
7529       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7530          Handle the second form here taking care that 'a' in the above can
7531          be a shift.  */
7532       else if (GET_CODE (op0) == XOR)
7533         {
7534           rtx newop0 = XEXP (op0, 0);
7535           rtx newop1 = XEXP (op0, 1);
7536           rtx op0_stripped = aarch64_strip_shift (newop0);
7537
7538           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7539           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7540
7541           if (speed)
7542             {
7543               if (op0_stripped != newop0)
7544                 *cost += extra_cost->alu.log_shift;
7545               else
7546                 *cost += extra_cost->alu.logical;
7547             }
7548
7549           return true;
7550         }
7551       /* MVN.  */
7552       if (speed)
7553         *cost += extra_cost->alu.logical;
7554
7555       return false;
7556
7557     case ZERO_EXTEND:
7558
7559       op0 = XEXP (x, 0);
7560       /* If a value is written in SI mode, then zero extended to DI
7561          mode, the operation will in general be free as a write to
7562          a 'w' register implicitly zeroes the upper bits of an 'x'
7563          register.  However, if this is
7564
7565            (set (reg) (zero_extend (reg)))
7566
7567          we must cost the explicit register move.  */
7568       if (mode == DImode
7569           && GET_MODE (op0) == SImode
7570           && outer == SET)
7571         {
7572           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7573
7574         /* If OP_COST is non-zero, then the cost of the zero extend
7575            is effectively the cost of the inner operation.  Otherwise
7576            we have a MOV instruction and we take the cost from the MOV
7577            itself.  This is true independently of whether we are
7578            optimizing for space or time.  */
7579           if (op_cost)
7580             *cost = op_cost;
7581
7582           return true;
7583         }
7584       else if (MEM_P (op0))
7585         {
7586           /* All loads can zero extend to any size for free.  */
7587           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7588           return true;
7589         }
7590
7591       op0 = aarch64_extend_bitfield_pattern_p (x);
7592       if (op0)
7593         {
7594           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7595           if (speed)
7596             *cost += extra_cost->alu.bfx;
7597           return true;
7598         }
7599
7600       if (speed)
7601         {
7602           if (VECTOR_MODE_P (mode))
7603             {
7604               /* UMOV.  */
7605               *cost += extra_cost->vect.alu;
7606             }
7607           else
7608             {
7609               /* We generate an AND instead of UXTB/UXTH.  */
7610               *cost += extra_cost->alu.logical;
7611             }
7612         }
7613       return false;
7614
7615     case SIGN_EXTEND:
7616       if (MEM_P (XEXP (x, 0)))
7617         {
7618           /* LDRSH.  */
7619           if (speed)
7620             {
7621               rtx address = XEXP (XEXP (x, 0), 0);
7622               *cost += extra_cost->ldst.load_sign_extend;
7623
7624               *cost +=
7625                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7626                                                      0, speed));
7627             }
7628           return true;
7629         }
7630
7631       op0 = aarch64_extend_bitfield_pattern_p (x);
7632       if (op0)
7633         {
7634           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7635           if (speed)
7636             *cost += extra_cost->alu.bfx;
7637           return true;
7638         }
7639
7640       if (speed)
7641         {
7642           if (VECTOR_MODE_P (mode))
7643             *cost += extra_cost->vect.alu;
7644           else
7645             *cost += extra_cost->alu.extend;
7646         }
7647       return false;
7648
7649     case ASHIFT:
7650       op0 = XEXP (x, 0);
7651       op1 = XEXP (x, 1);
7652
7653       if (CONST_INT_P (op1))
7654         {
7655           if (speed)
7656             {
7657               if (VECTOR_MODE_P (mode))
7658                 {
7659                   /* Vector shift (immediate).  */
7660                   *cost += extra_cost->vect.alu;
7661                 }
7662               else
7663                 {
7664                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7665                      aliases.  */
7666                   *cost += extra_cost->alu.shift;
7667                 }
7668             }
7669
7670           /* We can incorporate zero/sign extend for free.  */
7671           if (GET_CODE (op0) == ZERO_EXTEND
7672               || GET_CODE (op0) == SIGN_EXTEND)
7673             op0 = XEXP (op0, 0);
7674
7675           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7676           return true;
7677         }
7678       else
7679         {
7680           if (VECTOR_MODE_P (mode))
7681             {
7682               if (speed)
7683                 /* Vector shift (register).  */
7684                 *cost += extra_cost->vect.alu;
7685             }
7686           else
7687             {
7688               if (speed)
7689                 /* LSLV.  */
7690                 *cost += extra_cost->alu.shift_reg;
7691
7692               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7693                   && CONST_INT_P (XEXP (op1, 1))
7694                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7695                 {
7696                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7697                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7698                      don't recurse into it.  */
7699                   return true;
7700                 }
7701             }
7702           return false;  /* All arguments need to be in registers.  */
7703         }
7704
7705     case ROTATE:
7706     case ROTATERT:
7707     case LSHIFTRT:
7708     case ASHIFTRT:
7709       op0 = XEXP (x, 0);
7710       op1 = XEXP (x, 1);
7711
7712       if (CONST_INT_P (op1))
7713         {
7714           /* ASR (immediate) and friends.  */
7715           if (speed)
7716             {
7717               if (VECTOR_MODE_P (mode))
7718                 *cost += extra_cost->vect.alu;
7719               else
7720                 *cost += extra_cost->alu.shift;
7721             }
7722
7723           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7724           return true;
7725         }
7726       else
7727         {
7728           if (VECTOR_MODE_P (mode))
7729             {
7730               if (speed)
7731                 /* Vector shift (register).  */
7732                 *cost += extra_cost->vect.alu;
7733             }
7734           else
7735             {
7736               if (speed)
7737                 /* ASR (register) and friends.  */
7738                 *cost += extra_cost->alu.shift_reg;
7739
7740               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7741                   && CONST_INT_P (XEXP (op1, 1))
7742                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7743                 {
7744                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7745                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7746                      don't recurse into it.  */
7747                   return true;
7748                 }
7749             }
7750           return false;  /* All arguments need to be in registers.  */
7751         }
7752
7753     case SYMBOL_REF:
7754
7755       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7756           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7757         {
7758           /* LDR.  */
7759           if (speed)
7760             *cost += extra_cost->ldst.load;
7761         }
7762       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7763                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7764         {
7765           /* ADRP, followed by ADD.  */
7766           *cost += COSTS_N_INSNS (1);
7767           if (speed)
7768             *cost += 2 * extra_cost->alu.arith;
7769         }
7770       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7771                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7772         {
7773           /* ADR.  */
7774           if (speed)
7775             *cost += extra_cost->alu.arith;
7776         }
7777
7778       if (flag_pic)
7779         {
7780           /* One extra load instruction, after accessing the GOT.  */
7781           *cost += COSTS_N_INSNS (1);
7782           if (speed)
7783             *cost += extra_cost->ldst.load;
7784         }
7785       return true;
7786
7787     case HIGH:
7788     case LO_SUM:
7789       /* ADRP/ADD (immediate).  */
7790       if (speed)
7791         *cost += extra_cost->alu.arith;
7792       return true;
7793
7794     case ZERO_EXTRACT:
7795     case SIGN_EXTRACT:
7796       /* UBFX/SBFX.  */
7797       if (speed)
7798         {
7799           if (VECTOR_MODE_P (mode))
7800             *cost += extra_cost->vect.alu;
7801           else
7802             *cost += extra_cost->alu.bfx;
7803         }
7804
7805       /* We can trust that the immediates used will be correct (there
7806          are no by-register forms), so we need only cost op0.  */
7807       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7808       return true;
7809
7810     case MULT:
7811       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7812       /* aarch64_rtx_mult_cost always handles recursion to its
7813          operands.  */
7814       return true;
7815
7816     case MOD:
7817     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7818        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7819        an unconditional negate.  This case should only ever be reached through
7820        the set_smod_pow2_cheap check in expmed.c.  */
7821       if (CONST_INT_P (XEXP (x, 1))
7822           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7823           && (mode == SImode || mode == DImode))
7824         {
7825           /* We expand to 4 instructions.  Reset the baseline.  */
7826           *cost = COSTS_N_INSNS (4);
7827
7828           if (speed)
7829             *cost += 2 * extra_cost->alu.logical
7830                      + 2 * extra_cost->alu.arith;
7831
7832           return true;
7833         }
7834
7835     /* Fall-through.  */
7836     case UMOD:
7837       if (speed)
7838         {
7839           /* Slighly prefer UMOD over SMOD.  */
7840           if (VECTOR_MODE_P (mode))
7841             *cost += extra_cost->vect.alu;
7842           else if (GET_MODE_CLASS (mode) == MODE_INT)
7843             *cost += (extra_cost->mult[mode == DImode].add
7844                       + extra_cost->mult[mode == DImode].idiv
7845                       + (code == MOD ? 1 : 0));
7846         }
7847       return false;  /* All arguments need to be in registers.  */
7848
7849     case DIV:
7850     case UDIV:
7851     case SQRT:
7852       if (speed)
7853         {
7854           if (VECTOR_MODE_P (mode))
7855             *cost += extra_cost->vect.alu;
7856           else if (GET_MODE_CLASS (mode) == MODE_INT)
7857             /* There is no integer SQRT, so only DIV and UDIV can get
7858                here.  */
7859             *cost += (extra_cost->mult[mode == DImode].idiv
7860                      /* Slighly prefer UDIV over SDIV.  */
7861                      + (code == DIV ? 1 : 0));
7862           else
7863             *cost += extra_cost->fp[mode == DFmode].div;
7864         }
7865       return false;  /* All arguments need to be in registers.  */
7866
7867     case IF_THEN_ELSE:
7868       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7869                                          XEXP (x, 2), cost, speed);
7870
7871     case EQ:
7872     case NE:
7873     case GT:
7874     case GTU:
7875     case LT:
7876     case LTU:
7877     case GE:
7878     case GEU:
7879     case LE:
7880     case LEU:
7881
7882       return false; /* All arguments must be in registers.  */
7883
7884     case FMA:
7885       op0 = XEXP (x, 0);
7886       op1 = XEXP (x, 1);
7887       op2 = XEXP (x, 2);
7888
7889       if (speed)
7890         {
7891           if (VECTOR_MODE_P (mode))
7892             *cost += extra_cost->vect.alu;
7893           else
7894             *cost += extra_cost->fp[mode == DFmode].fma;
7895         }
7896
7897       /* FMSUB, FNMADD, and FNMSUB are free.  */
7898       if (GET_CODE (op0) == NEG)
7899         op0 = XEXP (op0, 0);
7900
7901       if (GET_CODE (op2) == NEG)
7902         op2 = XEXP (op2, 0);
7903
7904       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7905          and the by-element operand as operand 0.  */
7906       if (GET_CODE (op1) == NEG)
7907         op1 = XEXP (op1, 0);
7908
7909       /* Catch vector-by-element operations.  The by-element operand can
7910          either be (vec_duplicate (vec_select (x))) or just
7911          (vec_select (x)), depending on whether we are multiplying by
7912          a vector or a scalar.
7913
7914          Canonicalization is not very good in these cases, FMA4 will put the
7915          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7916       if (GET_CODE (op0) == VEC_DUPLICATE)
7917         op0 = XEXP (op0, 0);
7918       else if (GET_CODE (op1) == VEC_DUPLICATE)
7919         op1 = XEXP (op1, 0);
7920
7921       if (GET_CODE (op0) == VEC_SELECT)
7922         op0 = XEXP (op0, 0);
7923       else if (GET_CODE (op1) == VEC_SELECT)
7924         op1 = XEXP (op1, 0);
7925
7926       /* If the remaining parameters are not registers,
7927          get the cost to put them into registers.  */
7928       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7929       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7930       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7931       return true;
7932
7933     case FLOAT:
7934     case UNSIGNED_FLOAT:
7935       if (speed)
7936         *cost += extra_cost->fp[mode == DFmode].fromint;
7937       return false;
7938
7939     case FLOAT_EXTEND:
7940       if (speed)
7941         {
7942           if (VECTOR_MODE_P (mode))
7943             {
7944               /*Vector truncate.  */
7945               *cost += extra_cost->vect.alu;
7946             }
7947           else
7948             *cost += extra_cost->fp[mode == DFmode].widen;
7949         }
7950       return false;
7951
7952     case FLOAT_TRUNCATE:
7953       if (speed)
7954         {
7955           if (VECTOR_MODE_P (mode))
7956             {
7957               /*Vector conversion.  */
7958               *cost += extra_cost->vect.alu;
7959             }
7960           else
7961             *cost += extra_cost->fp[mode == DFmode].narrow;
7962         }
7963       return false;
7964
7965     case FIX:
7966     case UNSIGNED_FIX:
7967       x = XEXP (x, 0);
7968       /* Strip the rounding part.  They will all be implemented
7969          by the fcvt* family of instructions anyway.  */
7970       if (GET_CODE (x) == UNSPEC)
7971         {
7972           unsigned int uns_code = XINT (x, 1);
7973
7974           if (uns_code == UNSPEC_FRINTA
7975               || uns_code == UNSPEC_FRINTM
7976               || uns_code == UNSPEC_FRINTN
7977               || uns_code == UNSPEC_FRINTP
7978               || uns_code == UNSPEC_FRINTZ)
7979             x = XVECEXP (x, 0, 0);
7980         }
7981
7982       if (speed)
7983         {
7984           if (VECTOR_MODE_P (mode))
7985             *cost += extra_cost->vect.alu;
7986           else
7987             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7988         }
7989
7990       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7991          fixed-point fcvt.  */
7992       if (GET_CODE (x) == MULT
7993           && ((VECTOR_MODE_P (mode)
7994                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7995               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7996         {
7997           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7998                              0, speed);
7999           return true;
8000         }
8001
8002       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8003       return true;
8004
8005     case ABS:
8006       if (VECTOR_MODE_P (mode))
8007         {
8008           /* ABS (vector).  */
8009           if (speed)
8010             *cost += extra_cost->vect.alu;
8011         }
8012       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8013         {
8014           op0 = XEXP (x, 0);
8015
8016           /* FABD, which is analogous to FADD.  */
8017           if (GET_CODE (op0) == MINUS)
8018             {
8019               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8020               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8021               if (speed)
8022                 *cost += extra_cost->fp[mode == DFmode].addsub;
8023
8024               return true;
8025             }
8026           /* Simple FABS is analogous to FNEG.  */
8027           if (speed)
8028             *cost += extra_cost->fp[mode == DFmode].neg;
8029         }
8030       else
8031         {
8032           /* Integer ABS will either be split to
8033              two arithmetic instructions, or will be an ABS
8034              (scalar), which we don't model.  */
8035           *cost = COSTS_N_INSNS (2);
8036           if (speed)
8037             *cost += 2 * extra_cost->alu.arith;
8038         }
8039       return false;
8040
8041     case SMAX:
8042     case SMIN:
8043       if (speed)
8044         {
8045           if (VECTOR_MODE_P (mode))
8046             *cost += extra_cost->vect.alu;
8047           else
8048             {
8049               /* FMAXNM/FMINNM/FMAX/FMIN.
8050                  TODO: This may not be accurate for all implementations, but
8051                  we do not model this in the cost tables.  */
8052               *cost += extra_cost->fp[mode == DFmode].addsub;
8053             }
8054         }
8055       return false;
8056
8057     case UNSPEC:
8058       /* The floating point round to integer frint* instructions.  */
8059       if (aarch64_frint_unspec_p (XINT (x, 1)))
8060         {
8061           if (speed)
8062             *cost += extra_cost->fp[mode == DFmode].roundint;
8063
8064           return false;
8065         }
8066
8067       if (XINT (x, 1) == UNSPEC_RBIT)
8068         {
8069           if (speed)
8070             *cost += extra_cost->alu.rev;
8071
8072           return false;
8073         }
8074       break;
8075
8076     case TRUNCATE:
8077
8078       /* Decompose <su>muldi3_highpart.  */
8079       if (/* (truncate:DI  */
8080           mode == DImode
8081           /*   (lshiftrt:TI  */
8082           && GET_MODE (XEXP (x, 0)) == TImode
8083           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8084           /*      (mult:TI  */
8085           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8086           /*        (ANY_EXTEND:TI (reg:DI))
8087                     (ANY_EXTEND:TI (reg:DI)))  */
8088           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8089                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8090               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8091                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8092           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8093           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8094           /*     (const_int 64)  */
8095           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8096           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8097         {
8098           /* UMULH/SMULH.  */
8099           if (speed)
8100             *cost += extra_cost->mult[mode == DImode].extend;
8101           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8102                              mode, MULT, 0, speed);
8103           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8104                              mode, MULT, 1, speed);
8105           return true;
8106         }
8107
8108       /* Fall through.  */
8109     default:
8110       break;
8111     }
8112
8113   if (dump_file
8114       && flag_aarch64_verbose_cost)
8115     fprintf (dump_file,
8116       "\nFailed to cost RTX.  Assuming default cost.\n");
8117
8118   return true;
8119 }
8120
8121 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8122    calculated for X.  This cost is stored in *COST.  Returns true
8123    if the total cost of X was calculated.  */
8124 static bool
8125 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8126                    int param, int *cost, bool speed)
8127 {
8128   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8129
8130   if (dump_file
8131       && flag_aarch64_verbose_cost)
8132     {
8133       print_rtl_single (dump_file, x);
8134       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8135                speed ? "Hot" : "Cold",
8136                *cost, result ? "final" : "partial");
8137     }
8138
8139   return result;
8140 }
8141
8142 static int
8143 aarch64_register_move_cost (machine_mode mode,
8144                             reg_class_t from_i, reg_class_t to_i)
8145 {
8146   enum reg_class from = (enum reg_class) from_i;
8147   enum reg_class to = (enum reg_class) to_i;
8148   const struct cpu_regmove_cost *regmove_cost
8149     = aarch64_tune_params.regmove_cost;
8150
8151   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8152   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8153     to = GENERAL_REGS;
8154
8155   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8156     from = GENERAL_REGS;
8157
8158   /* Moving between GPR and stack cost is the same as GP2GP.  */
8159   if ((from == GENERAL_REGS && to == STACK_REG)
8160       || (to == GENERAL_REGS && from == STACK_REG))
8161     return regmove_cost->GP2GP;
8162
8163   /* To/From the stack register, we move via the gprs.  */
8164   if (to == STACK_REG || from == STACK_REG)
8165     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8166             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8167
8168   if (GET_MODE_SIZE (mode) == 16)
8169     {
8170       /* 128-bit operations on general registers require 2 instructions.  */
8171       if (from == GENERAL_REGS && to == GENERAL_REGS)
8172         return regmove_cost->GP2GP * 2;
8173       else if (from == GENERAL_REGS)
8174         return regmove_cost->GP2FP * 2;
8175       else if (to == GENERAL_REGS)
8176         return regmove_cost->FP2GP * 2;
8177
8178       /* When AdvSIMD instructions are disabled it is not possible to move
8179          a 128-bit value directly between Q registers.  This is handled in
8180          secondary reload.  A general register is used as a scratch to move
8181          the upper DI value and the lower DI value is moved directly,
8182          hence the cost is the sum of three moves. */
8183       if (! TARGET_SIMD)
8184         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8185
8186       return regmove_cost->FP2FP;
8187     }
8188
8189   if (from == GENERAL_REGS && to == GENERAL_REGS)
8190     return regmove_cost->GP2GP;
8191   else if (from == GENERAL_REGS)
8192     return regmove_cost->GP2FP;
8193   else if (to == GENERAL_REGS)
8194     return regmove_cost->FP2GP;
8195
8196   return regmove_cost->FP2FP;
8197 }
8198
8199 static int
8200 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8201                           reg_class_t rclass ATTRIBUTE_UNUSED,
8202                           bool in ATTRIBUTE_UNUSED)
8203 {
8204   return aarch64_tune_params.memmov_cost;
8205 }
8206
8207 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8208    to optimize 1.0/sqrt.  */
8209
8210 static bool
8211 use_rsqrt_p (machine_mode mode)
8212 {
8213   return (!flag_trapping_math
8214           && flag_unsafe_math_optimizations
8215           && ((aarch64_tune_params.approx_modes->recip_sqrt
8216                & AARCH64_APPROX_MODE (mode))
8217               || flag_mrecip_low_precision_sqrt));
8218 }
8219
8220 /* Function to decide when to use the approximate reciprocal square root
8221    builtin.  */
8222
8223 static tree
8224 aarch64_builtin_reciprocal (tree fndecl)
8225 {
8226   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8227
8228   if (!use_rsqrt_p (mode))
8229     return NULL_TREE;
8230   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8231 }
8232
8233 typedef rtx (*rsqrte_type) (rtx, rtx);
8234
8235 /* Select reciprocal square root initial estimate insn depending on machine
8236    mode.  */
8237
8238 static rsqrte_type
8239 get_rsqrte_type (machine_mode mode)
8240 {
8241   switch (mode)
8242   {
8243     case E_DFmode:   return gen_aarch64_rsqrtedf;
8244     case E_SFmode:   return gen_aarch64_rsqrtesf;
8245     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8246     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8247     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8248     default: gcc_unreachable ();
8249   }
8250 }
8251
8252 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8253
8254 /* Select reciprocal square root series step insn depending on machine mode.  */
8255
8256 static rsqrts_type
8257 get_rsqrts_type (machine_mode mode)
8258 {
8259   switch (mode)
8260   {
8261     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8262     case E_SFmode:   return gen_aarch64_rsqrtssf;
8263     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8264     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8265     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8266     default: gcc_unreachable ();
8267   }
8268 }
8269
8270 /* Emit instruction sequence to compute either the approximate square root
8271    or its approximate reciprocal, depending on the flag RECP, and return
8272    whether the sequence was emitted or not.  */
8273
8274 bool
8275 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8276 {
8277   machine_mode mode = GET_MODE (dst);
8278
8279   if (GET_MODE_INNER (mode) == HFmode)
8280     {
8281       gcc_assert (!recp);
8282       return false;
8283     }
8284
8285   machine_mode mmsk
8286     = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)).require (),
8287                        GET_MODE_NUNITS (mode));
8288   if (!recp)
8289     {
8290       if (!(flag_mlow_precision_sqrt
8291             || (aarch64_tune_params.approx_modes->sqrt
8292                 & AARCH64_APPROX_MODE (mode))))
8293         return false;
8294
8295       if (flag_finite_math_only
8296           || flag_trapping_math
8297           || !flag_unsafe_math_optimizations
8298           || optimize_function_for_size_p (cfun))
8299         return false;
8300     }
8301   else
8302     /* Caller assumes we cannot fail.  */
8303     gcc_assert (use_rsqrt_p (mode));
8304
8305
8306   rtx xmsk = gen_reg_rtx (mmsk);
8307   if (!recp)
8308     /* When calculating the approximate square root, compare the
8309        argument with 0.0 and create a mask.  */
8310     emit_insn (gen_rtx_SET (xmsk,
8311                             gen_rtx_NEG (mmsk,
8312                                          gen_rtx_EQ (mmsk, src,
8313                                                      CONST0_RTX (mode)))));
8314
8315   /* Estimate the approximate reciprocal square root.  */
8316   rtx xdst = gen_reg_rtx (mode);
8317   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8318
8319   /* Iterate over the series twice for SF and thrice for DF.  */
8320   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8321
8322   /* Optionally iterate over the series once less for faster performance
8323      while sacrificing the accuracy.  */
8324   if ((recp && flag_mrecip_low_precision_sqrt)
8325       || (!recp && flag_mlow_precision_sqrt))
8326     iterations--;
8327
8328   /* Iterate over the series to calculate the approximate reciprocal square
8329      root.  */
8330   rtx x1 = gen_reg_rtx (mode);
8331   while (iterations--)
8332     {
8333       rtx x2 = gen_reg_rtx (mode);
8334       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8335
8336       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8337
8338       if (iterations > 0)
8339         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8340     }
8341
8342   if (!recp)
8343     {
8344       /* Qualify the approximate reciprocal square root when the argument is
8345          0.0 by squashing the intermediary result to 0.0.  */
8346       rtx xtmp = gen_reg_rtx (mmsk);
8347       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8348                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8349       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8350
8351       /* Calculate the approximate square root.  */
8352       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8353     }
8354
8355   /* Finalize the approximation.  */
8356   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8357
8358   return true;
8359 }
8360
8361 typedef rtx (*recpe_type) (rtx, rtx);
8362
8363 /* Select reciprocal initial estimate insn depending on machine mode.  */
8364
8365 static recpe_type
8366 get_recpe_type (machine_mode mode)
8367 {
8368   switch (mode)
8369   {
8370     case E_SFmode:   return (gen_aarch64_frecpesf);
8371     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8372     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8373     case E_DFmode:   return (gen_aarch64_frecpedf);
8374     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8375     default:         gcc_unreachable ();
8376   }
8377 }
8378
8379 typedef rtx (*recps_type) (rtx, rtx, rtx);
8380
8381 /* Select reciprocal series step insn depending on machine mode.  */
8382
8383 static recps_type
8384 get_recps_type (machine_mode mode)
8385 {
8386   switch (mode)
8387   {
8388     case E_SFmode:   return (gen_aarch64_frecpssf);
8389     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8390     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8391     case E_DFmode:   return (gen_aarch64_frecpsdf);
8392     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8393     default:         gcc_unreachable ();
8394   }
8395 }
8396
8397 /* Emit the instruction sequence to compute the approximation for the division
8398    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8399
8400 bool
8401 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8402 {
8403   machine_mode mode = GET_MODE (quo);
8404
8405   if (GET_MODE_INNER (mode) == HFmode)
8406     return false;
8407
8408   bool use_approx_division_p = (flag_mlow_precision_div
8409                                 || (aarch64_tune_params.approx_modes->division
8410                                     & AARCH64_APPROX_MODE (mode)));
8411
8412   if (!flag_finite_math_only
8413       || flag_trapping_math
8414       || !flag_unsafe_math_optimizations
8415       || optimize_function_for_size_p (cfun)
8416       || !use_approx_division_p)
8417     return false;
8418
8419   /* Estimate the approximate reciprocal.  */
8420   rtx xrcp = gen_reg_rtx (mode);
8421   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8422
8423   /* Iterate over the series twice for SF and thrice for DF.  */
8424   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8425
8426   /* Optionally iterate over the series once less for faster performance,
8427      while sacrificing the accuracy.  */
8428   if (flag_mlow_precision_div)
8429     iterations--;
8430
8431   /* Iterate over the series to calculate the approximate reciprocal.  */
8432   rtx xtmp = gen_reg_rtx (mode);
8433   while (iterations--)
8434     {
8435       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8436
8437       if (iterations > 0)
8438         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8439     }
8440
8441   if (num != CONST1_RTX (mode))
8442     {
8443       /* As the approximate reciprocal of DEN is already calculated, only
8444          calculate the approximate division when NUM is not 1.0.  */
8445       rtx xnum = force_reg (mode, num);
8446       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8447     }
8448
8449   /* Finalize the approximation.  */
8450   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8451   return true;
8452 }
8453
8454 /* Return the number of instructions that can be issued per cycle.  */
8455 static int
8456 aarch64_sched_issue_rate (void)
8457 {
8458   return aarch64_tune_params.issue_rate;
8459 }
8460
8461 static int
8462 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8463 {
8464   int issue_rate = aarch64_sched_issue_rate ();
8465
8466   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8467 }
8468
8469
8470 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8471    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8472    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8473
8474 static int
8475 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8476                                                     int ready_index)
8477 {
8478   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8479 }
8480
8481
8482 /* Vectorizer cost model target hooks.  */
8483
8484 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8485 static int
8486 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8487                                     tree vectype,
8488                                     int misalign ATTRIBUTE_UNUSED)
8489 {
8490   unsigned elements;
8491   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8492   bool fp = false;
8493
8494   if (vectype != NULL)
8495     fp = FLOAT_TYPE_P (vectype);
8496
8497   switch (type_of_cost)
8498     {
8499       case scalar_stmt:
8500         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8501
8502       case scalar_load:
8503         return costs->scalar_load_cost;
8504
8505       case scalar_store:
8506         return costs->scalar_store_cost;
8507
8508       case vector_stmt:
8509         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8510
8511       case vector_load:
8512         return costs->vec_align_load_cost;
8513
8514       case vector_store:
8515         return costs->vec_store_cost;
8516
8517       case vec_to_scalar:
8518         return costs->vec_to_scalar_cost;
8519
8520       case scalar_to_vec:
8521         return costs->scalar_to_vec_cost;
8522
8523       case unaligned_load:
8524         return costs->vec_unalign_load_cost;
8525
8526       case unaligned_store:
8527         return costs->vec_unalign_store_cost;
8528
8529       case cond_branch_taken:
8530         return costs->cond_taken_branch_cost;
8531
8532       case cond_branch_not_taken:
8533         return costs->cond_not_taken_branch_cost;
8534
8535       case vec_perm:
8536         return costs->vec_permute_cost;
8537
8538       case vec_promote_demote:
8539         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8540
8541       case vec_construct:
8542         elements = TYPE_VECTOR_SUBPARTS (vectype);
8543         return elements / 2 + 1;
8544
8545       default:
8546         gcc_unreachable ();
8547     }
8548 }
8549
8550 /* Implement targetm.vectorize.add_stmt_cost.  */
8551 static unsigned
8552 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8553                        struct _stmt_vec_info *stmt_info, int misalign,
8554                        enum vect_cost_model_location where)
8555 {
8556   unsigned *cost = (unsigned *) data;
8557   unsigned retval = 0;
8558
8559   if (flag_vect_cost_model)
8560     {
8561       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8562       int stmt_cost =
8563             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8564
8565       /* Statements in an inner loop relative to the loop being
8566          vectorized are weighted more heavily.  The value here is
8567          arbitrary and could potentially be improved with analysis.  */
8568       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8569         count *= 50; /*  FIXME  */
8570
8571       retval = (unsigned) (count * stmt_cost);
8572       cost[where] += retval;
8573     }
8574
8575   return retval;
8576 }
8577
8578 static void initialize_aarch64_code_model (struct gcc_options *);
8579
8580 /* Parse the TO_PARSE string and put the architecture struct that it
8581    selects into RES and the architectural features into ISA_FLAGS.
8582    Return an aarch64_parse_opt_result describing the parse result.
8583    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8584
8585 static enum aarch64_parse_opt_result
8586 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8587                     unsigned long *isa_flags)
8588 {
8589   char *ext;
8590   const struct processor *arch;
8591   char *str = (char *) alloca (strlen (to_parse) + 1);
8592   size_t len;
8593
8594   strcpy (str, to_parse);
8595
8596   ext = strchr (str, '+');
8597
8598   if (ext != NULL)
8599     len = ext - str;
8600   else
8601     len = strlen (str);
8602
8603   if (len == 0)
8604     return AARCH64_PARSE_MISSING_ARG;
8605
8606
8607   /* Loop through the list of supported ARCHes to find a match.  */
8608   for (arch = all_architectures; arch->name != NULL; arch++)
8609     {
8610       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8611         {
8612           unsigned long isa_temp = arch->flags;
8613
8614           if (ext != NULL)
8615             {
8616               /* TO_PARSE string contains at least one extension.  */
8617               enum aarch64_parse_opt_result ext_res
8618                 = aarch64_parse_extension (ext, &isa_temp);
8619
8620               if (ext_res != AARCH64_PARSE_OK)
8621                 return ext_res;
8622             }
8623           /* Extension parsing was successful.  Confirm the result
8624              arch and ISA flags.  */
8625           *res = arch;
8626           *isa_flags = isa_temp;
8627           return AARCH64_PARSE_OK;
8628         }
8629     }
8630
8631   /* ARCH name not found in list.  */
8632   return AARCH64_PARSE_INVALID_ARG;
8633 }
8634
8635 /* Parse the TO_PARSE string and put the result tuning in RES and the
8636    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8637    describing the parse result.  If there is an error parsing, RES and
8638    ISA_FLAGS are left unchanged.  */
8639
8640 static enum aarch64_parse_opt_result
8641 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8642                    unsigned long *isa_flags)
8643 {
8644   char *ext;
8645   const struct processor *cpu;
8646   char *str = (char *) alloca (strlen (to_parse) + 1);
8647   size_t len;
8648
8649   strcpy (str, to_parse);
8650
8651   ext = strchr (str, '+');
8652
8653   if (ext != NULL)
8654     len = ext - str;
8655   else
8656     len = strlen (str);
8657
8658   if (len == 0)
8659     return AARCH64_PARSE_MISSING_ARG;
8660
8661
8662   /* Loop through the list of supported CPUs to find a match.  */
8663   for (cpu = all_cores; cpu->name != NULL; cpu++)
8664     {
8665       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8666         {
8667           unsigned long isa_temp = cpu->flags;
8668
8669
8670           if (ext != NULL)
8671             {
8672               /* TO_PARSE string contains at least one extension.  */
8673               enum aarch64_parse_opt_result ext_res
8674                 = aarch64_parse_extension (ext, &isa_temp);
8675
8676               if (ext_res != AARCH64_PARSE_OK)
8677                 return ext_res;
8678             }
8679           /* Extension parsing was successfull.  Confirm the result
8680              cpu and ISA flags.  */
8681           *res = cpu;
8682           *isa_flags = isa_temp;
8683           return AARCH64_PARSE_OK;
8684         }
8685     }
8686
8687   /* CPU name not found in list.  */
8688   return AARCH64_PARSE_INVALID_ARG;
8689 }
8690
8691 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8692    Return an aarch64_parse_opt_result describing the parse result.
8693    If the parsing fails the RES does not change.  */
8694
8695 static enum aarch64_parse_opt_result
8696 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8697 {
8698   const struct processor *cpu;
8699   char *str = (char *) alloca (strlen (to_parse) + 1);
8700
8701   strcpy (str, to_parse);
8702
8703   /* Loop through the list of supported CPUs to find a match.  */
8704   for (cpu = all_cores; cpu->name != NULL; cpu++)
8705     {
8706       if (strcmp (cpu->name, str) == 0)
8707         {
8708           *res = cpu;
8709           return AARCH64_PARSE_OK;
8710         }
8711     }
8712
8713   /* CPU name not found in list.  */
8714   return AARCH64_PARSE_INVALID_ARG;
8715 }
8716
8717 /* Parse TOKEN, which has length LENGTH to see if it is an option
8718    described in FLAG.  If it is, return the index bit for that fusion type.
8719    If not, error (printing OPTION_NAME) and return zero.  */
8720
8721 static unsigned int
8722 aarch64_parse_one_option_token (const char *token,
8723                                 size_t length,
8724                                 const struct aarch64_flag_desc *flag,
8725                                 const char *option_name)
8726 {
8727   for (; flag->name != NULL; flag++)
8728     {
8729       if (length == strlen (flag->name)
8730           && !strncmp (flag->name, token, length))
8731         return flag->flag;
8732     }
8733
8734   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8735   return 0;
8736 }
8737
8738 /* Parse OPTION which is a comma-separated list of flags to enable.
8739    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8740    default state we inherit from the CPU tuning structures.  OPTION_NAME
8741    gives the top-level option we are parsing in the -moverride string,
8742    for use in error messages.  */
8743
8744 static unsigned int
8745 aarch64_parse_boolean_options (const char *option,
8746                                const struct aarch64_flag_desc *flags,
8747                                unsigned int initial_state,
8748                                const char *option_name)
8749 {
8750   const char separator = '.';
8751   const char* specs = option;
8752   const char* ntoken = option;
8753   unsigned int found_flags = initial_state;
8754
8755   while ((ntoken = strchr (specs, separator)))
8756     {
8757       size_t token_length = ntoken - specs;
8758       unsigned token_ops = aarch64_parse_one_option_token (specs,
8759                                                            token_length,
8760                                                            flags,
8761                                                            option_name);
8762       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8763          in the token stream, reset the supported operations.  So:
8764
8765            adrp+add.cmp+branch.none.adrp+add
8766
8767            would have the result of turning on only adrp+add fusion.  */
8768       if (!token_ops)
8769         found_flags = 0;
8770
8771       found_flags |= token_ops;
8772       specs = ++ntoken;
8773     }
8774
8775   /* We ended with a comma, print something.  */
8776   if (!(*specs))
8777     {
8778       error ("%s string ill-formed\n", option_name);
8779       return 0;
8780     }
8781
8782   /* We still have one more token to parse.  */
8783   size_t token_length = strlen (specs);
8784   unsigned token_ops = aarch64_parse_one_option_token (specs,
8785                                                        token_length,
8786                                                        flags,
8787                                                        option_name);
8788    if (!token_ops)
8789      found_flags = 0;
8790
8791   found_flags |= token_ops;
8792   return found_flags;
8793 }
8794
8795 /* Support for overriding instruction fusion.  */
8796
8797 static void
8798 aarch64_parse_fuse_string (const char *fuse_string,
8799                             struct tune_params *tune)
8800 {
8801   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8802                                                      aarch64_fusible_pairs,
8803                                                      tune->fusible_ops,
8804                                                      "fuse=");
8805 }
8806
8807 /* Support for overriding other tuning flags.  */
8808
8809 static void
8810 aarch64_parse_tune_string (const char *tune_string,
8811                             struct tune_params *tune)
8812 {
8813   tune->extra_tuning_flags
8814     = aarch64_parse_boolean_options (tune_string,
8815                                      aarch64_tuning_flags,
8816                                      tune->extra_tuning_flags,
8817                                      "tune=");
8818 }
8819
8820 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8821    we understand.  If it is, extract the option string and handoff to
8822    the appropriate function.  */
8823
8824 void
8825 aarch64_parse_one_override_token (const char* token,
8826                                   size_t length,
8827                                   struct tune_params *tune)
8828 {
8829   const struct aarch64_tuning_override_function *fn
8830     = aarch64_tuning_override_functions;
8831
8832   const char *option_part = strchr (token, '=');
8833   if (!option_part)
8834     {
8835       error ("tuning string missing in option (%s)", token);
8836       return;
8837     }
8838
8839   /* Get the length of the option name.  */
8840   length = option_part - token;
8841   /* Skip the '=' to get to the option string.  */
8842   option_part++;
8843
8844   for (; fn->name != NULL; fn++)
8845     {
8846       if (!strncmp (fn->name, token, length))
8847         {
8848           fn->parse_override (option_part, tune);
8849           return;
8850         }
8851     }
8852
8853   error ("unknown tuning option (%s)",token);
8854   return;
8855 }
8856
8857 /* A checking mechanism for the implementation of the tls size.  */
8858
8859 static void
8860 initialize_aarch64_tls_size (struct gcc_options *opts)
8861 {
8862   if (aarch64_tls_size == 0)
8863     aarch64_tls_size = 24;
8864
8865   switch (opts->x_aarch64_cmodel_var)
8866     {
8867     case AARCH64_CMODEL_TINY:
8868       /* Both the default and maximum TLS size allowed under tiny is 1M which
8869          needs two instructions to address, so we clamp the size to 24.  */
8870       if (aarch64_tls_size > 24)
8871         aarch64_tls_size = 24;
8872       break;
8873     case AARCH64_CMODEL_SMALL:
8874       /* The maximum TLS size allowed under small is 4G.  */
8875       if (aarch64_tls_size > 32)
8876         aarch64_tls_size = 32;
8877       break;
8878     case AARCH64_CMODEL_LARGE:
8879       /* The maximum TLS size allowed under large is 16E.
8880          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8881       if (aarch64_tls_size > 48)
8882         aarch64_tls_size = 48;
8883       break;
8884     default:
8885       gcc_unreachable ();
8886     }
8887
8888   return;
8889 }
8890
8891 /* Parse STRING looking for options in the format:
8892      string     :: option:string
8893      option     :: name=substring
8894      name       :: {a-z}
8895      substring  :: defined by option.  */
8896
8897 static void
8898 aarch64_parse_override_string (const char* input_string,
8899                                struct tune_params* tune)
8900 {
8901   const char separator = ':';
8902   size_t string_length = strlen (input_string) + 1;
8903   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8904   char *string = string_root;
8905   strncpy (string, input_string, string_length);
8906   string[string_length - 1] = '\0';
8907
8908   char* ntoken = string;
8909
8910   while ((ntoken = strchr (string, separator)))
8911     {
8912       size_t token_length = ntoken - string;
8913       /* Make this substring look like a string.  */
8914       *ntoken = '\0';
8915       aarch64_parse_one_override_token (string, token_length, tune);
8916       string = ++ntoken;
8917     }
8918
8919   /* One last option to parse.  */
8920   aarch64_parse_one_override_token (string, strlen (string), tune);
8921   free (string_root);
8922 }
8923
8924
8925 static void
8926 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8927 {
8928   /* The logic here is that if we are disabling all frame pointer generation
8929      then we do not need to disable leaf frame pointer generation as a
8930      separate operation.  But if we are *only* disabling leaf frame pointer
8931      generation then we set flag_omit_frame_pointer to true, but in
8932      aarch64_frame_pointer_required we return false only for leaf functions.
8933
8934      PR 70044: We have to be careful about being called multiple times for the
8935      same function.  Once we have decided to set flag_omit_frame_pointer just
8936      so that we can omit leaf frame pointers, we must then not interpret a
8937      second call as meaning that all frame pointer generation should be
8938      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8939      non-zero value.  */
8940   if (opts->x_flag_omit_frame_pointer == 2)
8941     opts->x_flag_omit_frame_pointer = 0;
8942
8943   if (opts->x_flag_omit_frame_pointer)
8944     opts->x_flag_omit_leaf_frame_pointer = false;
8945   else if (opts->x_flag_omit_leaf_frame_pointer)
8946     opts->x_flag_omit_frame_pointer = 2;
8947
8948   /* If not optimizing for size, set the default
8949      alignment to what the target wants.  */
8950   if (!opts->x_optimize_size)
8951     {
8952       if (opts->x_align_loops <= 0)
8953         opts->x_align_loops = aarch64_tune_params.loop_align;
8954       if (opts->x_align_jumps <= 0)
8955         opts->x_align_jumps = aarch64_tune_params.jump_align;
8956       if (opts->x_align_functions <= 0)
8957         opts->x_align_functions = aarch64_tune_params.function_align;
8958     }
8959
8960   /* We default to no pc-relative literal loads.  */
8961
8962   aarch64_pcrelative_literal_loads = false;
8963
8964   /* If -mpc-relative-literal-loads is set on the command line, this
8965      implies that the user asked for PC relative literal loads.  */
8966   if (opts->x_pcrelative_literal_loads == 1)
8967     aarch64_pcrelative_literal_loads = true;
8968
8969   /* This is PR70113. When building the Linux kernel with
8970      CONFIG_ARM64_ERRATUM_843419, support for relocations
8971      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8972      removed from the kernel to avoid loading objects with possibly
8973      offending sequences.  Without -mpc-relative-literal-loads we would
8974      generate such relocations, preventing the kernel build from
8975      succeeding.  */
8976   if (opts->x_pcrelative_literal_loads == 2
8977       && TARGET_FIX_ERR_A53_843419)
8978     aarch64_pcrelative_literal_loads = true;
8979
8980   /* In the tiny memory model it makes no sense to disallow PC relative
8981      literal pool loads.  */
8982   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8983       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8984     aarch64_pcrelative_literal_loads = true;
8985
8986   /* When enabling the lower precision Newton series for the square root, also
8987      enable it for the reciprocal square root, since the latter is an
8988      intermediary step for the former.  */
8989   if (flag_mlow_precision_sqrt)
8990     flag_mrecip_low_precision_sqrt = true;
8991 }
8992
8993 /* 'Unpack' up the internal tuning structs and update the options
8994     in OPTS.  The caller must have set up selected_tune and selected_arch
8995     as all the other target-specific codegen decisions are
8996     derived from them.  */
8997
8998 void
8999 aarch64_override_options_internal (struct gcc_options *opts)
9000 {
9001   aarch64_tune_flags = selected_tune->flags;
9002   aarch64_tune = selected_tune->sched_core;
9003   /* Make a copy of the tuning parameters attached to the core, which
9004      we may later overwrite.  */
9005   aarch64_tune_params = *(selected_tune->tune);
9006   aarch64_architecture_version = selected_arch->architecture_version;
9007
9008   if (opts->x_aarch64_override_tune_string)
9009     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9010                                   &aarch64_tune_params);
9011
9012   /* This target defaults to strict volatile bitfields.  */
9013   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9014     opts->x_flag_strict_volatile_bitfields = 1;
9015
9016   initialize_aarch64_code_model (opts);
9017   initialize_aarch64_tls_size (opts);
9018
9019   int queue_depth = 0;
9020   switch (aarch64_tune_params.autoprefetcher_model)
9021     {
9022       case tune_params::AUTOPREFETCHER_OFF:
9023         queue_depth = -1;
9024         break;
9025       case tune_params::AUTOPREFETCHER_WEAK:
9026         queue_depth = 0;
9027         break;
9028       case tune_params::AUTOPREFETCHER_STRONG:
9029         queue_depth = max_insn_queue_index + 1;
9030         break;
9031       default:
9032         gcc_unreachable ();
9033     }
9034
9035   /* We don't mind passing in global_options_set here as we don't use
9036      the *options_set structs anyway.  */
9037   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9038                          queue_depth,
9039                          opts->x_param_values,
9040                          global_options_set.x_param_values);
9041
9042   /* Set up parameters to be used in prefetching algorithm.  Do not
9043      override the defaults unless we are tuning for a core we have
9044      researched values for.  */
9045   if (aarch64_tune_params.prefetch->num_slots > 0)
9046     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9047                            aarch64_tune_params.prefetch->num_slots,
9048                            opts->x_param_values,
9049                            global_options_set.x_param_values);
9050   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9051     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9052                            aarch64_tune_params.prefetch->l1_cache_size,
9053                            opts->x_param_values,
9054                            global_options_set.x_param_values);
9055   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9056     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9057                            aarch64_tune_params.prefetch->l1_cache_line_size,
9058                            opts->x_param_values,
9059                            global_options_set.x_param_values);
9060   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9061     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9062                            aarch64_tune_params.prefetch->l2_cache_size,
9063                            opts->x_param_values,
9064                            global_options_set.x_param_values);
9065
9066   /* Enable sw prefetching at specified optimization level for
9067      CPUS that have prefetch.  Lower optimization level threshold by 1
9068      when profiling is enabled.  */
9069   if (opts->x_flag_prefetch_loop_arrays < 0
9070       && !opts->x_optimize_size
9071       && aarch64_tune_params.prefetch->default_opt_level >= 0
9072       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9073     opts->x_flag_prefetch_loop_arrays = 1;
9074
9075   aarch64_override_options_after_change_1 (opts);
9076 }
9077
9078 /* Print a hint with a suggestion for a core or architecture name that
9079    most closely resembles what the user passed in STR.  ARCH is true if
9080    the user is asking for an architecture name.  ARCH is false if the user
9081    is asking for a core name.  */
9082
9083 static void
9084 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9085 {
9086   auto_vec<const char *> candidates;
9087   const struct processor *entry = arch ? all_architectures : all_cores;
9088   for (; entry->name != NULL; entry++)
9089     candidates.safe_push (entry->name);
9090   char *s;
9091   const char *hint = candidates_list_and_hint (str, s, candidates);
9092   if (hint)
9093     inform (input_location, "valid arguments are: %s;"
9094                              " did you mean %qs?", s, hint);
9095   XDELETEVEC (s);
9096 }
9097
9098 /* Print a hint with a suggestion for a core name that most closely resembles
9099    what the user passed in STR.  */
9100
9101 inline static void
9102 aarch64_print_hint_for_core (const char *str)
9103 {
9104   aarch64_print_hint_for_core_or_arch (str, false);
9105 }
9106
9107 /* Print a hint with a suggestion for an architecture name that most closely
9108    resembles what the user passed in STR.  */
9109
9110 inline static void
9111 aarch64_print_hint_for_arch (const char *str)
9112 {
9113   aarch64_print_hint_for_core_or_arch (str, true);
9114 }
9115
9116 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9117    specified in STR and throw errors if appropriate.  Put the results if
9118    they are valid in RES and ISA_FLAGS.  Return whether the option is
9119    valid.  */
9120
9121 static bool
9122 aarch64_validate_mcpu (const char *str, const struct processor **res,
9123                        unsigned long *isa_flags)
9124 {
9125   enum aarch64_parse_opt_result parse_res
9126     = aarch64_parse_cpu (str, res, isa_flags);
9127
9128   if (parse_res == AARCH64_PARSE_OK)
9129     return true;
9130
9131   switch (parse_res)
9132     {
9133       case AARCH64_PARSE_MISSING_ARG:
9134         error ("missing cpu name in %<-mcpu=%s%>", str);
9135         break;
9136       case AARCH64_PARSE_INVALID_ARG:
9137         error ("unknown value %qs for -mcpu", str);
9138         aarch64_print_hint_for_core (str);
9139         break;
9140       case AARCH64_PARSE_INVALID_FEATURE:
9141         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9142         break;
9143       default:
9144         gcc_unreachable ();
9145     }
9146
9147   return false;
9148 }
9149
9150 /* Validate a command-line -march option.  Parse the arch and extensions
9151    (if any) specified in STR and throw errors if appropriate.  Put the
9152    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9153    option is valid.  */
9154
9155 static bool
9156 aarch64_validate_march (const char *str, const struct processor **res,
9157                          unsigned long *isa_flags)
9158 {
9159   enum aarch64_parse_opt_result parse_res
9160     = aarch64_parse_arch (str, res, isa_flags);
9161
9162   if (parse_res == AARCH64_PARSE_OK)
9163     return true;
9164
9165   switch (parse_res)
9166     {
9167       case AARCH64_PARSE_MISSING_ARG:
9168         error ("missing arch name in %<-march=%s%>", str);
9169         break;
9170       case AARCH64_PARSE_INVALID_ARG:
9171         error ("unknown value %qs for -march", str);
9172         aarch64_print_hint_for_arch (str);
9173         break;
9174       case AARCH64_PARSE_INVALID_FEATURE:
9175         error ("invalid feature modifier in %<-march=%s%>", str);
9176         break;
9177       default:
9178         gcc_unreachable ();
9179     }
9180
9181   return false;
9182 }
9183
9184 /* Validate a command-line -mtune option.  Parse the cpu
9185    specified in STR and throw errors if appropriate.  Put the
9186    result, if it is valid, in RES.  Return whether the option is
9187    valid.  */
9188
9189 static bool
9190 aarch64_validate_mtune (const char *str, const struct processor **res)
9191 {
9192   enum aarch64_parse_opt_result parse_res
9193     = aarch64_parse_tune (str, res);
9194
9195   if (parse_res == AARCH64_PARSE_OK)
9196     return true;
9197
9198   switch (parse_res)
9199     {
9200       case AARCH64_PARSE_MISSING_ARG:
9201         error ("missing cpu name in %<-mtune=%s%>", str);
9202         break;
9203       case AARCH64_PARSE_INVALID_ARG:
9204         error ("unknown value %qs for -mtune", str);
9205         aarch64_print_hint_for_core (str);
9206         break;
9207       default:
9208         gcc_unreachable ();
9209     }
9210   return false;
9211 }
9212
9213 /* Return the CPU corresponding to the enum CPU.
9214    If it doesn't specify a cpu, return the default.  */
9215
9216 static const struct processor *
9217 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9218 {
9219   if (cpu != aarch64_none)
9220     return &all_cores[cpu];
9221
9222   /* The & 0x3f is to extract the bottom 6 bits that encode the
9223      default cpu as selected by the --with-cpu GCC configure option
9224      in config.gcc.
9225      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9226      flags mechanism should be reworked to make it more sane.  */
9227   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9228 }
9229
9230 /* Return the architecture corresponding to the enum ARCH.
9231    If it doesn't specify a valid architecture, return the default.  */
9232
9233 static const struct processor *
9234 aarch64_get_arch (enum aarch64_arch arch)
9235 {
9236   if (arch != aarch64_no_arch)
9237     return &all_architectures[arch];
9238
9239   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9240
9241   return &all_architectures[cpu->arch];
9242 }
9243
9244 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9245    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9246    tuning structs.  In particular it must set selected_tune and
9247    aarch64_isa_flags that define the available ISA features and tuning
9248    decisions.  It must also set selected_arch as this will be used to
9249    output the .arch asm tags for each function.  */
9250
9251 static void
9252 aarch64_override_options (void)
9253 {
9254   unsigned long cpu_isa = 0;
9255   unsigned long arch_isa = 0;
9256   aarch64_isa_flags = 0;
9257
9258   bool valid_cpu = true;
9259   bool valid_tune = true;
9260   bool valid_arch = true;
9261
9262   selected_cpu = NULL;
9263   selected_arch = NULL;
9264   selected_tune = NULL;
9265
9266   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9267      If either of -march or -mtune is given, they override their
9268      respective component of -mcpu.  */
9269   if (aarch64_cpu_string)
9270     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9271                                         &cpu_isa);
9272
9273   if (aarch64_arch_string)
9274     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9275                                           &arch_isa);
9276
9277   if (aarch64_tune_string)
9278     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9279
9280   /* If the user did not specify a processor, choose the default
9281      one for them.  This will be the CPU set during configuration using
9282      --with-cpu, otherwise it is "generic".  */
9283   if (!selected_cpu)
9284     {
9285       if (selected_arch)
9286         {
9287           selected_cpu = &all_cores[selected_arch->ident];
9288           aarch64_isa_flags = arch_isa;
9289           explicit_arch = selected_arch->arch;
9290         }
9291       else
9292         {
9293           /* Get default configure-time CPU.  */
9294           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9295           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9296         }
9297
9298       if (selected_tune)
9299         explicit_tune_core = selected_tune->ident;
9300     }
9301   /* If both -mcpu and -march are specified check that they are architecturally
9302      compatible, warn if they're not and prefer the -march ISA flags.  */
9303   else if (selected_arch)
9304     {
9305       if (selected_arch->arch != selected_cpu->arch)
9306         {
9307           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9308                        all_architectures[selected_cpu->arch].name,
9309                        selected_arch->name);
9310         }
9311       aarch64_isa_flags = arch_isa;
9312       explicit_arch = selected_arch->arch;
9313       explicit_tune_core = selected_tune ? selected_tune->ident
9314                                           : selected_cpu->ident;
9315     }
9316   else
9317     {
9318       /* -mcpu but no -march.  */
9319       aarch64_isa_flags = cpu_isa;
9320       explicit_tune_core = selected_tune ? selected_tune->ident
9321                                           : selected_cpu->ident;
9322       gcc_assert (selected_cpu);
9323       selected_arch = &all_architectures[selected_cpu->arch];
9324       explicit_arch = selected_arch->arch;
9325     }
9326
9327   /* Set the arch as well as we will need it when outputing
9328      the .arch directive in assembly.  */
9329   if (!selected_arch)
9330     {
9331       gcc_assert (selected_cpu);
9332       selected_arch = &all_architectures[selected_cpu->arch];
9333     }
9334
9335   if (!selected_tune)
9336     selected_tune = selected_cpu;
9337
9338 #ifndef HAVE_AS_MABI_OPTION
9339   /* The compiler may have been configured with 2.23.* binutils, which does
9340      not have support for ILP32.  */
9341   if (TARGET_ILP32)
9342     error ("Assembler does not support -mabi=ilp32");
9343 #endif
9344
9345   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9346     sorry ("Return address signing is only supported for -mabi=lp64");
9347
9348   /* Make sure we properly set up the explicit options.  */
9349   if ((aarch64_cpu_string && valid_cpu)
9350        || (aarch64_tune_string && valid_tune))
9351     gcc_assert (explicit_tune_core != aarch64_none);
9352
9353   if ((aarch64_cpu_string && valid_cpu)
9354        || (aarch64_arch_string && valid_arch))
9355     gcc_assert (explicit_arch != aarch64_no_arch);
9356
9357   aarch64_override_options_internal (&global_options);
9358
9359   /* Save these options as the default ones in case we push and pop them later
9360      while processing functions with potential target attributes.  */
9361   target_option_default_node = target_option_current_node
9362       = build_target_option_node (&global_options);
9363 }
9364
9365 /* Implement targetm.override_options_after_change.  */
9366
9367 static void
9368 aarch64_override_options_after_change (void)
9369 {
9370   aarch64_override_options_after_change_1 (&global_options);
9371 }
9372
9373 static struct machine_function *
9374 aarch64_init_machine_status (void)
9375 {
9376   struct machine_function *machine;
9377   machine = ggc_cleared_alloc<machine_function> ();
9378   return machine;
9379 }
9380
9381 void
9382 aarch64_init_expanders (void)
9383 {
9384   init_machine_status = aarch64_init_machine_status;
9385 }
9386
9387 /* A checking mechanism for the implementation of the various code models.  */
9388 static void
9389 initialize_aarch64_code_model (struct gcc_options *opts)
9390 {
9391    if (opts->x_flag_pic)
9392      {
9393        switch (opts->x_aarch64_cmodel_var)
9394          {
9395          case AARCH64_CMODEL_TINY:
9396            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9397            break;
9398          case AARCH64_CMODEL_SMALL:
9399 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9400            aarch64_cmodel = (flag_pic == 2
9401                              ? AARCH64_CMODEL_SMALL_PIC
9402                              : AARCH64_CMODEL_SMALL_SPIC);
9403 #else
9404            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9405 #endif
9406            break;
9407          case AARCH64_CMODEL_LARGE:
9408            sorry ("code model %qs with -f%s", "large",
9409                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9410            break;
9411          default:
9412            gcc_unreachable ();
9413          }
9414      }
9415    else
9416      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9417 }
9418
9419 /* Implement TARGET_OPTION_SAVE.  */
9420
9421 static void
9422 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9423 {
9424   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9425 }
9426
9427 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9428    using the information saved in PTR.  */
9429
9430 static void
9431 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9432 {
9433   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9434   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9435   opts->x_explicit_arch = ptr->x_explicit_arch;
9436   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9437   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9438
9439   aarch64_override_options_internal (opts);
9440 }
9441
9442 /* Implement TARGET_OPTION_PRINT.  */
9443
9444 static void
9445 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9446 {
9447   const struct processor *cpu
9448     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9449   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9450   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9451   std::string extension
9452     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9453
9454   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9455   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9456            arch->name, extension.c_str ());
9457 }
9458
9459 static GTY(()) tree aarch64_previous_fndecl;
9460
9461 void
9462 aarch64_reset_previous_fndecl (void)
9463 {
9464   aarch64_previous_fndecl = NULL;
9465 }
9466
9467 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9468    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9469    make sure optab availability predicates are recomputed when necessary.  */
9470
9471 void
9472 aarch64_save_restore_target_globals (tree new_tree)
9473 {
9474   if (TREE_TARGET_GLOBALS (new_tree))
9475     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9476   else if (new_tree == target_option_default_node)
9477     restore_target_globals (&default_target_globals);
9478   else
9479     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9480 }
9481
9482 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9483    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9484    of the function, if such exists.  This function may be called multiple
9485    times on a single function so use aarch64_previous_fndecl to avoid
9486    setting up identical state.  */
9487
9488 static void
9489 aarch64_set_current_function (tree fndecl)
9490 {
9491   if (!fndecl || fndecl == aarch64_previous_fndecl)
9492     return;
9493
9494   tree old_tree = (aarch64_previous_fndecl
9495                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9496                    : NULL_TREE);
9497
9498   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9499
9500   /* If current function has no attributes but the previous one did,
9501      use the default node.  */
9502   if (!new_tree && old_tree)
9503     new_tree = target_option_default_node;
9504
9505   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9506      the default have been handled by aarch64_save_restore_target_globals from
9507      aarch64_pragma_target_parse.  */
9508   if (old_tree == new_tree)
9509     return;
9510
9511   aarch64_previous_fndecl = fndecl;
9512
9513   /* First set the target options.  */
9514   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9515
9516   aarch64_save_restore_target_globals (new_tree);
9517 }
9518
9519 /* Enum describing the various ways we can handle attributes.
9520    In many cases we can reuse the generic option handling machinery.  */
9521
9522 enum aarch64_attr_opt_type
9523 {
9524   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9525   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9526   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9527   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9528 };
9529
9530 /* All the information needed to handle a target attribute.
9531    NAME is the name of the attribute.
9532    ATTR_TYPE specifies the type of behavior of the attribute as described
9533    in the definition of enum aarch64_attr_opt_type.
9534    ALLOW_NEG is true if the attribute supports a "no-" form.
9535    HANDLER is the function that takes the attribute string and whether
9536    it is a pragma or attribute and handles the option.  It is needed only
9537    when the ATTR_TYPE is aarch64_attr_custom.
9538    OPT_NUM is the enum specifying the option that the attribute modifies.
9539    This is needed for attributes that mirror the behavior of a command-line
9540    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9541    aarch64_attr_enum.  */
9542
9543 struct aarch64_attribute_info
9544 {
9545   const char *name;
9546   enum aarch64_attr_opt_type attr_type;
9547   bool allow_neg;
9548   bool (*handler) (const char *, const char *);
9549   enum opt_code opt_num;
9550 };
9551
9552 /* Handle the ARCH_STR argument to the arch= target attribute.
9553    PRAGMA_OR_ATTR is used in potential error messages.  */
9554
9555 static bool
9556 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9557 {
9558   const struct processor *tmp_arch = NULL;
9559   enum aarch64_parse_opt_result parse_res
9560     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9561
9562   if (parse_res == AARCH64_PARSE_OK)
9563     {
9564       gcc_assert (tmp_arch);
9565       selected_arch = tmp_arch;
9566       explicit_arch = selected_arch->arch;
9567       return true;
9568     }
9569
9570   switch (parse_res)
9571     {
9572       case AARCH64_PARSE_MISSING_ARG:
9573         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9574         break;
9575       case AARCH64_PARSE_INVALID_ARG:
9576         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9577         aarch64_print_hint_for_arch (str);
9578         break;
9579       case AARCH64_PARSE_INVALID_FEATURE:
9580         error ("invalid feature modifier %qs for 'arch' target %s",
9581                str, pragma_or_attr);
9582         break;
9583       default:
9584         gcc_unreachable ();
9585     }
9586
9587   return false;
9588 }
9589
9590 /* Handle the argument CPU_STR to the cpu= target attribute.
9591    PRAGMA_OR_ATTR is used in potential error messages.  */
9592
9593 static bool
9594 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9595 {
9596   const struct processor *tmp_cpu = NULL;
9597   enum aarch64_parse_opt_result parse_res
9598     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9599
9600   if (parse_res == AARCH64_PARSE_OK)
9601     {
9602       gcc_assert (tmp_cpu);
9603       selected_tune = tmp_cpu;
9604       explicit_tune_core = selected_tune->ident;
9605
9606       selected_arch = &all_architectures[tmp_cpu->arch];
9607       explicit_arch = selected_arch->arch;
9608       return true;
9609     }
9610
9611   switch (parse_res)
9612     {
9613       case AARCH64_PARSE_MISSING_ARG:
9614         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9615         break;
9616       case AARCH64_PARSE_INVALID_ARG:
9617         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9618         aarch64_print_hint_for_core (str);
9619         break;
9620       case AARCH64_PARSE_INVALID_FEATURE:
9621         error ("invalid feature modifier %qs for 'cpu' target %s",
9622                str, pragma_or_attr);
9623         break;
9624       default:
9625         gcc_unreachable ();
9626     }
9627
9628   return false;
9629 }
9630
9631 /* Handle the argument STR to the tune= target attribute.
9632    PRAGMA_OR_ATTR is used in potential error messages.  */
9633
9634 static bool
9635 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9636 {
9637   const struct processor *tmp_tune = NULL;
9638   enum aarch64_parse_opt_result parse_res
9639     = aarch64_parse_tune (str, &tmp_tune);
9640
9641   if (parse_res == AARCH64_PARSE_OK)
9642     {
9643       gcc_assert (tmp_tune);
9644       selected_tune = tmp_tune;
9645       explicit_tune_core = selected_tune->ident;
9646       return true;
9647     }
9648
9649   switch (parse_res)
9650     {
9651       case AARCH64_PARSE_INVALID_ARG:
9652         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9653         aarch64_print_hint_for_core (str);
9654         break;
9655       default:
9656         gcc_unreachable ();
9657     }
9658
9659   return false;
9660 }
9661
9662 /* Parse an architecture extensions target attribute string specified in STR.
9663    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9664    if successful.  Update aarch64_isa_flags to reflect the ISA features
9665    modified.
9666    PRAGMA_OR_ATTR is used in potential error messages.  */
9667
9668 static bool
9669 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9670 {
9671   enum aarch64_parse_opt_result parse_res;
9672   unsigned long isa_flags = aarch64_isa_flags;
9673
9674   /* We allow "+nothing" in the beginning to clear out all architectural
9675      features if the user wants to handpick specific features.  */
9676   if (strncmp ("+nothing", str, 8) == 0)
9677     {
9678       isa_flags = 0;
9679       str += 8;
9680     }
9681
9682   parse_res = aarch64_parse_extension (str, &isa_flags);
9683
9684   if (parse_res == AARCH64_PARSE_OK)
9685     {
9686       aarch64_isa_flags = isa_flags;
9687       return true;
9688     }
9689
9690   switch (parse_res)
9691     {
9692       case AARCH64_PARSE_MISSING_ARG:
9693         error ("missing feature modifier in target %s %qs",
9694                pragma_or_attr, str);
9695         break;
9696
9697       case AARCH64_PARSE_INVALID_FEATURE:
9698         error ("invalid feature modifier in target %s %qs",
9699                pragma_or_attr, str);
9700         break;
9701
9702       default:
9703         gcc_unreachable ();
9704     }
9705
9706  return false;
9707 }
9708
9709 /* The target attributes that we support.  On top of these we also support just
9710    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9711    handled explicitly in aarch64_process_one_target_attr.  */
9712
9713 static const struct aarch64_attribute_info aarch64_attributes[] =
9714 {
9715   { "general-regs-only", aarch64_attr_mask, false, NULL,
9716      OPT_mgeneral_regs_only },
9717   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9718      OPT_mfix_cortex_a53_835769 },
9719   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9720      OPT_mfix_cortex_a53_843419 },
9721   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9722   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9723   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9724      OPT_momit_leaf_frame_pointer },
9725   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9726   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9727      OPT_march_ },
9728   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9729   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9730      OPT_mtune_ },
9731   { "sign-return-address", aarch64_attr_enum, false, NULL,
9732      OPT_msign_return_address_ },
9733   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9734 };
9735
9736 /* Parse ARG_STR which contains the definition of one target attribute.
9737    Show appropriate errors if any or return true if the attribute is valid.
9738    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9739    we're processing a target attribute or pragma.  */
9740
9741 static bool
9742 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9743 {
9744   bool invert = false;
9745
9746   size_t len = strlen (arg_str);
9747
9748   if (len == 0)
9749     {
9750       error ("malformed target %s", pragma_or_attr);
9751       return false;
9752     }
9753
9754   char *str_to_check = (char *) alloca (len + 1);
9755   strcpy (str_to_check, arg_str);
9756
9757   /* Skip leading whitespace.  */
9758   while (*str_to_check == ' ' || *str_to_check == '\t')
9759     str_to_check++;
9760
9761   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9762      It is easier to detect and handle it explicitly here rather than going
9763      through the machinery for the rest of the target attributes in this
9764      function.  */
9765   if (*str_to_check == '+')
9766     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9767
9768   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9769     {
9770       invert = true;
9771       str_to_check += 3;
9772     }
9773   char *arg = strchr (str_to_check, '=');
9774
9775   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9776      and point ARG to "foo".  */
9777   if (arg)
9778     {
9779       *arg = '\0';
9780       arg++;
9781     }
9782   const struct aarch64_attribute_info *p_attr;
9783   bool found = false;
9784   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9785     {
9786       /* If the names don't match up, or the user has given an argument
9787          to an attribute that doesn't accept one, or didn't give an argument
9788          to an attribute that expects one, fail to match.  */
9789       if (strcmp (str_to_check, p_attr->name) != 0)
9790         continue;
9791
9792       found = true;
9793       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9794                               || p_attr->attr_type == aarch64_attr_enum;
9795
9796       if (attr_need_arg_p ^ (arg != NULL))
9797         {
9798           error ("target %s %qs does not accept an argument",
9799                   pragma_or_attr, str_to_check);
9800           return false;
9801         }
9802
9803       /* If the name matches but the attribute does not allow "no-" versions
9804          then we can't match.  */
9805       if (invert && !p_attr->allow_neg)
9806         {
9807           error ("target %s %qs does not allow a negated form",
9808                   pragma_or_attr, str_to_check);
9809           return false;
9810         }
9811
9812       switch (p_attr->attr_type)
9813         {
9814         /* Has a custom handler registered.
9815            For example, cpu=, arch=, tune=.  */
9816           case aarch64_attr_custom:
9817             gcc_assert (p_attr->handler);
9818             if (!p_attr->handler (arg, pragma_or_attr))
9819               return false;
9820             break;
9821
9822           /* Either set or unset a boolean option.  */
9823           case aarch64_attr_bool:
9824             {
9825               struct cl_decoded_option decoded;
9826
9827               generate_option (p_attr->opt_num, NULL, !invert,
9828                                CL_TARGET, &decoded);
9829               aarch64_handle_option (&global_options, &global_options_set,
9830                                       &decoded, input_location);
9831               break;
9832             }
9833           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9834              should know what mask to apply given the option number.  */
9835           case aarch64_attr_mask:
9836             {
9837               struct cl_decoded_option decoded;
9838               /* We only need to specify the option number.
9839                  aarch64_handle_option will know which mask to apply.  */
9840               decoded.opt_index = p_attr->opt_num;
9841               decoded.value = !invert;
9842               aarch64_handle_option (&global_options, &global_options_set,
9843                                       &decoded, input_location);
9844               break;
9845             }
9846           /* Use the option setting machinery to set an option to an enum.  */
9847           case aarch64_attr_enum:
9848             {
9849               gcc_assert (arg);
9850               bool valid;
9851               int value;
9852               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9853                                               &value, CL_TARGET);
9854               if (valid)
9855                 {
9856                   set_option (&global_options, NULL, p_attr->opt_num, value,
9857                               NULL, DK_UNSPECIFIED, input_location,
9858                               global_dc);
9859                 }
9860               else
9861                 {
9862                   error ("target %s %s=%s is not valid",
9863                          pragma_or_attr, str_to_check, arg);
9864                 }
9865               break;
9866             }
9867           default:
9868             gcc_unreachable ();
9869         }
9870     }
9871
9872   /* If we reached here we either have found an attribute and validated
9873      it or didn't match any.  If we matched an attribute but its arguments
9874      were malformed we will have returned false already.  */
9875   return found;
9876 }
9877
9878 /* Count how many times the character C appears in
9879    NULL-terminated string STR.  */
9880
9881 static unsigned int
9882 num_occurences_in_str (char c, char *str)
9883 {
9884   unsigned int res = 0;
9885   while (*str != '\0')
9886     {
9887       if (*str == c)
9888         res++;
9889
9890       str++;
9891     }
9892
9893   return res;
9894 }
9895
9896 /* Parse the tree in ARGS that contains the target attribute information
9897    and update the global target options space.  PRAGMA_OR_ATTR is a string
9898    to be used in error messages, specifying whether this is processing
9899    a target attribute or a target pragma.  */
9900
9901 bool
9902 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9903 {
9904   if (TREE_CODE (args) == TREE_LIST)
9905     {
9906       do
9907         {
9908           tree head = TREE_VALUE (args);
9909           if (head)
9910             {
9911               if (!aarch64_process_target_attr (head, pragma_or_attr))
9912                 return false;
9913             }
9914           args = TREE_CHAIN (args);
9915         } while (args);
9916
9917       return true;
9918     }
9919
9920   if (TREE_CODE (args) != STRING_CST)
9921     {
9922       error ("attribute %<target%> argument not a string");
9923       return false;
9924     }
9925
9926   size_t len = strlen (TREE_STRING_POINTER (args));
9927   char *str_to_check = (char *) alloca (len + 1);
9928   strcpy (str_to_check, TREE_STRING_POINTER (args));
9929
9930   if (len == 0)
9931     {
9932       error ("malformed target %s value", pragma_or_attr);
9933       return false;
9934     }
9935
9936   /* Used to catch empty spaces between commas i.e.
9937      attribute ((target ("attr1,,attr2"))).  */
9938   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9939
9940   /* Handle multiple target attributes separated by ','.  */
9941   char *token = strtok (str_to_check, ",");
9942
9943   unsigned int num_attrs = 0;
9944   while (token)
9945     {
9946       num_attrs++;
9947       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9948         {
9949           error ("target %s %qs is invalid", pragma_or_attr, token);
9950           return false;
9951         }
9952
9953       token = strtok (NULL, ",");
9954     }
9955
9956   if (num_attrs != num_commas + 1)
9957     {
9958       error ("malformed target %s list %qs",
9959               pragma_or_attr, TREE_STRING_POINTER (args));
9960       return false;
9961     }
9962
9963   return true;
9964 }
9965
9966 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9967    process attribute ((target ("..."))).  */
9968
9969 static bool
9970 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9971 {
9972   struct cl_target_option cur_target;
9973   bool ret;
9974   tree old_optimize;
9975   tree new_target, new_optimize;
9976   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9977
9978   /* If what we're processing is the current pragma string then the
9979      target option node is already stored in target_option_current_node
9980      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9981      having to re-parse the string.  This is especially useful to keep
9982      arm_neon.h compile times down since that header contains a lot
9983      of intrinsics enclosed in pragmas.  */
9984   if (!existing_target && args == current_target_pragma)
9985     {
9986       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9987       return true;
9988     }
9989   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9990
9991   old_optimize = build_optimization_node (&global_options);
9992   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9993
9994   /* If the function changed the optimization levels as well as setting
9995      target options, start with the optimizations specified.  */
9996   if (func_optimize && func_optimize != old_optimize)
9997     cl_optimization_restore (&global_options,
9998                              TREE_OPTIMIZATION (func_optimize));
9999
10000   /* Save the current target options to restore at the end.  */
10001   cl_target_option_save (&cur_target, &global_options);
10002
10003   /* If fndecl already has some target attributes applied to it, unpack
10004      them so that we add this attribute on top of them, rather than
10005      overwriting them.  */
10006   if (existing_target)
10007     {
10008       struct cl_target_option *existing_options
10009         = TREE_TARGET_OPTION (existing_target);
10010
10011       if (existing_options)
10012         cl_target_option_restore (&global_options, existing_options);
10013     }
10014   else
10015     cl_target_option_restore (&global_options,
10016                         TREE_TARGET_OPTION (target_option_current_node));
10017
10018
10019   ret = aarch64_process_target_attr (args, "attribute");
10020
10021   /* Set up any additional state.  */
10022   if (ret)
10023     {
10024       aarch64_override_options_internal (&global_options);
10025       /* Initialize SIMD builtins if we haven't already.
10026          Set current_target_pragma to NULL for the duration so that
10027          the builtin initialization code doesn't try to tag the functions
10028          being built with the attributes specified by any current pragma, thus
10029          going into an infinite recursion.  */
10030       if (TARGET_SIMD)
10031         {
10032           tree saved_current_target_pragma = current_target_pragma;
10033           current_target_pragma = NULL;
10034           aarch64_init_simd_builtins ();
10035           current_target_pragma = saved_current_target_pragma;
10036         }
10037       new_target = build_target_option_node (&global_options);
10038     }
10039   else
10040     new_target = NULL;
10041
10042   new_optimize = build_optimization_node (&global_options);
10043
10044   if (fndecl && ret)
10045     {
10046       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10047
10048       if (old_optimize != new_optimize)
10049         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10050     }
10051
10052   cl_target_option_restore (&global_options, &cur_target);
10053
10054   if (old_optimize != new_optimize)
10055     cl_optimization_restore (&global_options,
10056                              TREE_OPTIMIZATION (old_optimize));
10057   return ret;
10058 }
10059
10060 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10061    tri-bool options (yes, no, don't care) and the default value is
10062    DEF, determine whether to reject inlining.  */
10063
10064 static bool
10065 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10066                                      int dont_care, int def)
10067 {
10068   /* If the callee doesn't care, always allow inlining.  */
10069   if (callee == dont_care)
10070     return true;
10071
10072   /* If the caller doesn't care, always allow inlining.  */
10073   if (caller == dont_care)
10074     return true;
10075
10076   /* Otherwise, allow inlining if either the callee and caller values
10077      agree, or if the callee is using the default value.  */
10078   return (callee == caller || callee == def);
10079 }
10080
10081 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10082    to inline CALLEE into CALLER based on target-specific info.
10083    Make sure that the caller and callee have compatible architectural
10084    features.  Then go through the other possible target attributes
10085    and see if they can block inlining.  Try not to reject always_inline
10086    callees unless they are incompatible architecturally.  */
10087
10088 static bool
10089 aarch64_can_inline_p (tree caller, tree callee)
10090 {
10091   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10092   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10093
10094   /* If callee has no option attributes, then it is ok to inline.  */
10095   if (!callee_tree)
10096     return true;
10097
10098   struct cl_target_option *caller_opts
10099         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10100                                            : target_option_default_node);
10101
10102   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10103
10104
10105   /* Callee's ISA flags should be a subset of the caller's.  */
10106   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10107        != callee_opts->x_aarch64_isa_flags)
10108     return false;
10109
10110   /* Allow non-strict aligned functions inlining into strict
10111      aligned ones.  */
10112   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10113        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10114       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10115            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10116     return false;
10117
10118   bool always_inline = lookup_attribute ("always_inline",
10119                                           DECL_ATTRIBUTES (callee));
10120
10121   /* If the architectural features match up and the callee is always_inline
10122      then the other attributes don't matter.  */
10123   if (always_inline)
10124     return true;
10125
10126   if (caller_opts->x_aarch64_cmodel_var
10127       != callee_opts->x_aarch64_cmodel_var)
10128     return false;
10129
10130   if (caller_opts->x_aarch64_tls_dialect
10131       != callee_opts->x_aarch64_tls_dialect)
10132     return false;
10133
10134   /* Honour explicit requests to workaround errata.  */
10135   if (!aarch64_tribools_ok_for_inlining_p (
10136           caller_opts->x_aarch64_fix_a53_err835769,
10137           callee_opts->x_aarch64_fix_a53_err835769,
10138           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10139     return false;
10140
10141   if (!aarch64_tribools_ok_for_inlining_p (
10142           caller_opts->x_aarch64_fix_a53_err843419,
10143           callee_opts->x_aarch64_fix_a53_err843419,
10144           2, TARGET_FIX_ERR_A53_843419))
10145     return false;
10146
10147   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10148      caller and calle and they don't match up, reject inlining.  */
10149   if (!aarch64_tribools_ok_for_inlining_p (
10150           caller_opts->x_flag_omit_leaf_frame_pointer,
10151           callee_opts->x_flag_omit_leaf_frame_pointer,
10152           2, 1))
10153     return false;
10154
10155   /* If the callee has specific tuning overrides, respect them.  */
10156   if (callee_opts->x_aarch64_override_tune_string != NULL
10157       && caller_opts->x_aarch64_override_tune_string == NULL)
10158     return false;
10159
10160   /* If the user specified tuning override strings for the
10161      caller and callee and they don't match up, reject inlining.
10162      We just do a string compare here, we don't analyze the meaning
10163      of the string, as it would be too costly for little gain.  */
10164   if (callee_opts->x_aarch64_override_tune_string
10165       && caller_opts->x_aarch64_override_tune_string
10166       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10167                   caller_opts->x_aarch64_override_tune_string) != 0))
10168     return false;
10169
10170   return true;
10171 }
10172
10173 /* Return true if SYMBOL_REF X binds locally.  */
10174
10175 static bool
10176 aarch64_symbol_binds_local_p (const_rtx x)
10177 {
10178   return (SYMBOL_REF_DECL (x)
10179           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10180           : SYMBOL_REF_LOCAL_P (x));
10181 }
10182
10183 /* Return true if SYMBOL_REF X is thread local */
10184 static bool
10185 aarch64_tls_symbol_p (rtx x)
10186 {
10187   if (! TARGET_HAVE_TLS)
10188     return false;
10189
10190   if (GET_CODE (x) != SYMBOL_REF)
10191     return false;
10192
10193   return SYMBOL_REF_TLS_MODEL (x) != 0;
10194 }
10195
10196 /* Classify a TLS symbol into one of the TLS kinds.  */
10197 enum aarch64_symbol_type
10198 aarch64_classify_tls_symbol (rtx x)
10199 {
10200   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10201
10202   switch (tls_kind)
10203     {
10204     case TLS_MODEL_GLOBAL_DYNAMIC:
10205     case TLS_MODEL_LOCAL_DYNAMIC:
10206       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10207
10208     case TLS_MODEL_INITIAL_EXEC:
10209       switch (aarch64_cmodel)
10210         {
10211         case AARCH64_CMODEL_TINY:
10212         case AARCH64_CMODEL_TINY_PIC:
10213           return SYMBOL_TINY_TLSIE;
10214         default:
10215           return SYMBOL_SMALL_TLSIE;
10216         }
10217
10218     case TLS_MODEL_LOCAL_EXEC:
10219       if (aarch64_tls_size == 12)
10220         return SYMBOL_TLSLE12;
10221       else if (aarch64_tls_size == 24)
10222         return SYMBOL_TLSLE24;
10223       else if (aarch64_tls_size == 32)
10224         return SYMBOL_TLSLE32;
10225       else if (aarch64_tls_size == 48)
10226         return SYMBOL_TLSLE48;
10227       else
10228         gcc_unreachable ();
10229
10230     case TLS_MODEL_EMULATED:
10231     case TLS_MODEL_NONE:
10232       return SYMBOL_FORCE_TO_MEM;
10233
10234     default:
10235       gcc_unreachable ();
10236     }
10237 }
10238
10239 /* Return the method that should be used to access SYMBOL_REF or
10240    LABEL_REF X.  */
10241
10242 enum aarch64_symbol_type
10243 aarch64_classify_symbol (rtx x, rtx offset)
10244 {
10245   if (GET_CODE (x) == LABEL_REF)
10246     {
10247       switch (aarch64_cmodel)
10248         {
10249         case AARCH64_CMODEL_LARGE:
10250           return SYMBOL_FORCE_TO_MEM;
10251
10252         case AARCH64_CMODEL_TINY_PIC:
10253         case AARCH64_CMODEL_TINY:
10254           return SYMBOL_TINY_ABSOLUTE;
10255
10256         case AARCH64_CMODEL_SMALL_SPIC:
10257         case AARCH64_CMODEL_SMALL_PIC:
10258         case AARCH64_CMODEL_SMALL:
10259           return SYMBOL_SMALL_ABSOLUTE;
10260
10261         default:
10262           gcc_unreachable ();
10263         }
10264     }
10265
10266   if (GET_CODE (x) == SYMBOL_REF)
10267     {
10268       if (aarch64_tls_symbol_p (x))
10269         return aarch64_classify_tls_symbol (x);
10270
10271       switch (aarch64_cmodel)
10272         {
10273         case AARCH64_CMODEL_TINY:
10274           /* When we retrieve symbol + offset address, we have to make sure
10275              the offset does not cause overflow of the final address.  But
10276              we have no way of knowing the address of symbol at compile time
10277              so we can't accurately say if the distance between the PC and
10278              symbol + offset is outside the addressible range of +/-1M in the
10279              TINY code model.  So we rely on images not being greater than
10280              1M and cap the offset at 1M and anything beyond 1M will have to
10281              be loaded using an alternative mechanism.  Furthermore if the
10282              symbol is a weak reference to something that isn't known to
10283              resolve to a symbol in this module, then force to memory.  */
10284           if ((SYMBOL_REF_WEAK (x)
10285                && !aarch64_symbol_binds_local_p (x))
10286               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10287             return SYMBOL_FORCE_TO_MEM;
10288           return SYMBOL_TINY_ABSOLUTE;
10289
10290         case AARCH64_CMODEL_SMALL:
10291           /* Same reasoning as the tiny code model, but the offset cap here is
10292              4G.  */
10293           if ((SYMBOL_REF_WEAK (x)
10294                && !aarch64_symbol_binds_local_p (x))
10295               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10296                             HOST_WIDE_INT_C (4294967264)))
10297             return SYMBOL_FORCE_TO_MEM;
10298           return SYMBOL_SMALL_ABSOLUTE;
10299
10300         case AARCH64_CMODEL_TINY_PIC:
10301           if (!aarch64_symbol_binds_local_p (x))
10302             return SYMBOL_TINY_GOT;
10303           return SYMBOL_TINY_ABSOLUTE;
10304
10305         case AARCH64_CMODEL_SMALL_SPIC:
10306         case AARCH64_CMODEL_SMALL_PIC:
10307           if (!aarch64_symbol_binds_local_p (x))
10308             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10309                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10310           return SYMBOL_SMALL_ABSOLUTE;
10311
10312         case AARCH64_CMODEL_LARGE:
10313           /* This is alright even in PIC code as the constant
10314              pool reference is always PC relative and within
10315              the same translation unit.  */
10316           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10317             return SYMBOL_SMALL_ABSOLUTE;
10318           else
10319             return SYMBOL_FORCE_TO_MEM;
10320
10321         default:
10322           gcc_unreachable ();
10323         }
10324     }
10325
10326   /* By default push everything into the constant pool.  */
10327   return SYMBOL_FORCE_TO_MEM;
10328 }
10329
10330 bool
10331 aarch64_constant_address_p (rtx x)
10332 {
10333   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10334 }
10335
10336 bool
10337 aarch64_legitimate_pic_operand_p (rtx x)
10338 {
10339   if (GET_CODE (x) == SYMBOL_REF
10340       || (GET_CODE (x) == CONST
10341           && GET_CODE (XEXP (x, 0)) == PLUS
10342           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10343      return false;
10344
10345   return true;
10346 }
10347
10348 /* Return true if X holds either a quarter-precision or
10349      floating-point +0.0 constant.  */
10350 static bool
10351 aarch64_valid_floating_const (rtx x)
10352 {
10353   if (!CONST_DOUBLE_P (x))
10354     return false;
10355
10356   /* This call determines which constants can be used in mov<mode>
10357      as integer moves instead of constant loads.  */
10358   if (aarch64_float_const_rtx_p (x))
10359     return true;
10360
10361   return aarch64_float_const_representable_p (x);
10362 }
10363
10364 static bool
10365 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10366 {
10367   /* Do not allow vector struct mode constants.  We could support
10368      0 and -1 easily, but they need support in aarch64-simd.md.  */
10369   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10370     return false;
10371
10372   /* For these cases we never want to use a literal load.
10373      As such we have to prevent the compiler from forcing these
10374      to memory.  */
10375   if ((GET_CODE (x) == CONST_VECTOR
10376        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10377       || CONST_INT_P (x)
10378       || aarch64_valid_floating_const (x)
10379       || aarch64_can_const_movi_rtx_p (x, mode)
10380       || aarch64_float_const_rtx_p (x))
10381         return !targetm.cannot_force_const_mem (mode, x);
10382
10383   if (GET_CODE (x) == HIGH
10384       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10385     return true;
10386
10387   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10388      so spilling them is better than rematerialization.  */
10389   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10390     return true;
10391
10392   return aarch64_constant_address_p (x);
10393 }
10394
10395 rtx
10396 aarch64_load_tp (rtx target)
10397 {
10398   if (!target
10399       || GET_MODE (target) != Pmode
10400       || !register_operand (target, Pmode))
10401     target = gen_reg_rtx (Pmode);
10402
10403   /* Can return in any reg.  */
10404   emit_insn (gen_aarch64_load_tp_hard (target));
10405   return target;
10406 }
10407
10408 /* On AAPCS systems, this is the "struct __va_list".  */
10409 static GTY(()) tree va_list_type;
10410
10411 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10412    Return the type to use as __builtin_va_list.
10413
10414    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10415
10416    struct __va_list
10417    {
10418      void *__stack;
10419      void *__gr_top;
10420      void *__vr_top;
10421      int   __gr_offs;
10422      int   __vr_offs;
10423    };  */
10424
10425 static tree
10426 aarch64_build_builtin_va_list (void)
10427 {
10428   tree va_list_name;
10429   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10430
10431   /* Create the type.  */
10432   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10433   /* Give it the required name.  */
10434   va_list_name = build_decl (BUILTINS_LOCATION,
10435                              TYPE_DECL,
10436                              get_identifier ("__va_list"),
10437                              va_list_type);
10438   DECL_ARTIFICIAL (va_list_name) = 1;
10439   TYPE_NAME (va_list_type) = va_list_name;
10440   TYPE_STUB_DECL (va_list_type) = va_list_name;
10441
10442   /* Create the fields.  */
10443   f_stack = build_decl (BUILTINS_LOCATION,
10444                         FIELD_DECL, get_identifier ("__stack"),
10445                         ptr_type_node);
10446   f_grtop = build_decl (BUILTINS_LOCATION,
10447                         FIELD_DECL, get_identifier ("__gr_top"),
10448                         ptr_type_node);
10449   f_vrtop = build_decl (BUILTINS_LOCATION,
10450                         FIELD_DECL, get_identifier ("__vr_top"),
10451                         ptr_type_node);
10452   f_groff = build_decl (BUILTINS_LOCATION,
10453                         FIELD_DECL, get_identifier ("__gr_offs"),
10454                         integer_type_node);
10455   f_vroff = build_decl (BUILTINS_LOCATION,
10456                         FIELD_DECL, get_identifier ("__vr_offs"),
10457                         integer_type_node);
10458
10459   /* Tell tree-stdarg pass about our internal offset fields.
10460      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10461      purpose to identify whether the code is updating va_list internal
10462      offset fields through irregular way.  */
10463   va_list_gpr_counter_field = f_groff;
10464   va_list_fpr_counter_field = f_vroff;
10465
10466   DECL_ARTIFICIAL (f_stack) = 1;
10467   DECL_ARTIFICIAL (f_grtop) = 1;
10468   DECL_ARTIFICIAL (f_vrtop) = 1;
10469   DECL_ARTIFICIAL (f_groff) = 1;
10470   DECL_ARTIFICIAL (f_vroff) = 1;
10471
10472   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10473   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10474   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10475   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10476   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10477
10478   TYPE_FIELDS (va_list_type) = f_stack;
10479   DECL_CHAIN (f_stack) = f_grtop;
10480   DECL_CHAIN (f_grtop) = f_vrtop;
10481   DECL_CHAIN (f_vrtop) = f_groff;
10482   DECL_CHAIN (f_groff) = f_vroff;
10483
10484   /* Compute its layout.  */
10485   layout_type (va_list_type);
10486
10487   return va_list_type;
10488 }
10489
10490 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10491 static void
10492 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10493 {
10494   const CUMULATIVE_ARGS *cum;
10495   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10496   tree stack, grtop, vrtop, groff, vroff;
10497   tree t;
10498   int gr_save_area_size = cfun->va_list_gpr_size;
10499   int vr_save_area_size = cfun->va_list_fpr_size;
10500   int vr_offset;
10501
10502   cum = &crtl->args.info;
10503   if (cfun->va_list_gpr_size)
10504     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10505                              cfun->va_list_gpr_size);
10506   if (cfun->va_list_fpr_size)
10507     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10508                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10509
10510   if (!TARGET_FLOAT)
10511     {
10512       gcc_assert (cum->aapcs_nvrn == 0);
10513       vr_save_area_size = 0;
10514     }
10515
10516   f_stack = TYPE_FIELDS (va_list_type_node);
10517   f_grtop = DECL_CHAIN (f_stack);
10518   f_vrtop = DECL_CHAIN (f_grtop);
10519   f_groff = DECL_CHAIN (f_vrtop);
10520   f_vroff = DECL_CHAIN (f_groff);
10521
10522   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10523                   NULL_TREE);
10524   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10525                   NULL_TREE);
10526   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10527                   NULL_TREE);
10528   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10529                   NULL_TREE);
10530   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10531                   NULL_TREE);
10532
10533   /* Emit code to initialize STACK, which points to the next varargs stack
10534      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10535      by named arguments.  STACK is 8-byte aligned.  */
10536   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10537   if (cum->aapcs_stack_size > 0)
10538     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10539   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10540   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10541
10542   /* Emit code to initialize GRTOP, the top of the GR save area.
10543      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10544   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10545   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10546   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10547
10548   /* Emit code to initialize VRTOP, the top of the VR save area.
10549      This address is gr_save_area_bytes below GRTOP, rounded
10550      down to the next 16-byte boundary.  */
10551   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10552   vr_offset = ROUND_UP (gr_save_area_size,
10553                         STACK_BOUNDARY / BITS_PER_UNIT);
10554
10555   if (vr_offset)
10556     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10557   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10558   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10559
10560   /* Emit code to initialize GROFF, the offset from GRTOP of the
10561      next GPR argument.  */
10562   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10563               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10564   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10565
10566   /* Likewise emit code to initialize VROFF, the offset from FTOP
10567      of the next VR argument.  */
10568   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10569               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10570   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10571 }
10572
10573 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10574
10575 static tree
10576 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10577                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10578 {
10579   tree addr;
10580   bool indirect_p;
10581   bool is_ha;           /* is HFA or HVA.  */
10582   bool dw_align;        /* double-word align.  */
10583   machine_mode ag_mode = VOIDmode;
10584   int nregs;
10585   machine_mode mode;
10586
10587   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10588   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10589   HOST_WIDE_INT size, rsize, adjust, align;
10590   tree t, u, cond1, cond2;
10591
10592   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10593   if (indirect_p)
10594     type = build_pointer_type (type);
10595
10596   mode = TYPE_MODE (type);
10597
10598   f_stack = TYPE_FIELDS (va_list_type_node);
10599   f_grtop = DECL_CHAIN (f_stack);
10600   f_vrtop = DECL_CHAIN (f_grtop);
10601   f_groff = DECL_CHAIN (f_vrtop);
10602   f_vroff = DECL_CHAIN (f_groff);
10603
10604   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10605                   f_stack, NULL_TREE);
10606   size = int_size_in_bytes (type);
10607   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10608
10609   dw_align = false;
10610   adjust = 0;
10611   if (aarch64_vfp_is_call_or_return_candidate (mode,
10612                                                type,
10613                                                &ag_mode,
10614                                                &nregs,
10615                                                &is_ha))
10616     {
10617       /* TYPE passed in fp/simd registers.  */
10618       if (!TARGET_FLOAT)
10619         aarch64_err_no_fpadvsimd (mode, "varargs");
10620
10621       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10622                       unshare_expr (valist), f_vrtop, NULL_TREE);
10623       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10624                       unshare_expr (valist), f_vroff, NULL_TREE);
10625
10626       rsize = nregs * UNITS_PER_VREG;
10627
10628       if (is_ha)
10629         {
10630           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10631             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10632         }
10633       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10634                && size < UNITS_PER_VREG)
10635         {
10636           adjust = UNITS_PER_VREG - size;
10637         }
10638     }
10639   else
10640     {
10641       /* TYPE passed in general registers.  */
10642       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10643                       unshare_expr (valist), f_grtop, NULL_TREE);
10644       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10645                       unshare_expr (valist), f_groff, NULL_TREE);
10646       rsize = ROUND_UP (size, UNITS_PER_WORD);
10647       nregs = rsize / UNITS_PER_WORD;
10648
10649       if (align > 8)
10650         dw_align = true;
10651
10652       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10653           && size < UNITS_PER_WORD)
10654         {
10655           adjust = UNITS_PER_WORD  - size;
10656         }
10657     }
10658
10659   /* Get a local temporary for the field value.  */
10660   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10661
10662   /* Emit code to branch if off >= 0.  */
10663   t = build2 (GE_EXPR, boolean_type_node, off,
10664               build_int_cst (TREE_TYPE (off), 0));
10665   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10666
10667   if (dw_align)
10668     {
10669       /* Emit: offs = (offs + 15) & -16.  */
10670       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10671                   build_int_cst (TREE_TYPE (off), 15));
10672       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10673                   build_int_cst (TREE_TYPE (off), -16));
10674       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10675     }
10676   else
10677     roundup = NULL;
10678
10679   /* Update ap.__[g|v]r_offs  */
10680   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10681               build_int_cst (TREE_TYPE (off), rsize));
10682   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10683
10684   /* String up.  */
10685   if (roundup)
10686     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10687
10688   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10689   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10690               build_int_cst (TREE_TYPE (f_off), 0));
10691   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10692
10693   /* String up: make sure the assignment happens before the use.  */
10694   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10695   COND_EXPR_ELSE (cond1) = t;
10696
10697   /* Prepare the trees handling the argument that is passed on the stack;
10698      the top level node will store in ON_STACK.  */
10699   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10700   if (align > 8)
10701     {
10702       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10703       t = fold_convert (intDI_type_node, arg);
10704       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10705                   build_int_cst (TREE_TYPE (t), 15));
10706       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10707                   build_int_cst (TREE_TYPE (t), -16));
10708       t = fold_convert (TREE_TYPE (arg), t);
10709       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10710     }
10711   else
10712     roundup = NULL;
10713   /* Advance ap.__stack  */
10714   t = fold_convert (intDI_type_node, arg);
10715   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10716               build_int_cst (TREE_TYPE (t), size + 7));
10717   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10718               build_int_cst (TREE_TYPE (t), -8));
10719   t = fold_convert (TREE_TYPE (arg), t);
10720   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10721   /* String up roundup and advance.  */
10722   if (roundup)
10723     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10724   /* String up with arg */
10725   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10726   /* Big-endianness related address adjustment.  */
10727   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10728       && size < UNITS_PER_WORD)
10729   {
10730     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10731                 size_int (UNITS_PER_WORD - size));
10732     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10733   }
10734
10735   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10736   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10737
10738   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10739   t = off;
10740   if (adjust)
10741     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10742                 build_int_cst (TREE_TYPE (off), adjust));
10743
10744   t = fold_convert (sizetype, t);
10745   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10746
10747   if (is_ha)
10748     {
10749       /* type ha; // treat as "struct {ftype field[n];}"
10750          ... [computing offs]
10751          for (i = 0; i <nregs; ++i, offs += 16)
10752            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10753          return ha;  */
10754       int i;
10755       tree tmp_ha, field_t, field_ptr_t;
10756
10757       /* Declare a local variable.  */
10758       tmp_ha = create_tmp_var_raw (type, "ha");
10759       gimple_add_tmp_var (tmp_ha);
10760
10761       /* Establish the base type.  */
10762       switch (ag_mode)
10763         {
10764         case E_SFmode:
10765           field_t = float_type_node;
10766           field_ptr_t = float_ptr_type_node;
10767           break;
10768         case E_DFmode:
10769           field_t = double_type_node;
10770           field_ptr_t = double_ptr_type_node;
10771           break;
10772         case E_TFmode:
10773           field_t = long_double_type_node;
10774           field_ptr_t = long_double_ptr_type_node;
10775           break;
10776         case E_HFmode:
10777           field_t = aarch64_fp16_type_node;
10778           field_ptr_t = aarch64_fp16_ptr_type_node;
10779           break;
10780         case E_V2SImode:
10781         case E_V4SImode:
10782             {
10783               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10784               field_t = build_vector_type_for_mode (innertype, ag_mode);
10785               field_ptr_t = build_pointer_type (field_t);
10786             }
10787           break;
10788         default:
10789           gcc_assert (0);
10790         }
10791
10792       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10793       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10794       addr = t;
10795       t = fold_convert (field_ptr_t, addr);
10796       t = build2 (MODIFY_EXPR, field_t,
10797                   build1 (INDIRECT_REF, field_t, tmp_ha),
10798                   build1 (INDIRECT_REF, field_t, t));
10799
10800       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10801       for (i = 1; i < nregs; ++i)
10802         {
10803           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10804           u = fold_convert (field_ptr_t, addr);
10805           u = build2 (MODIFY_EXPR, field_t,
10806                       build2 (MEM_REF, field_t, tmp_ha,
10807                               build_int_cst (field_ptr_t,
10808                                              (i *
10809                                               int_size_in_bytes (field_t)))),
10810                       build1 (INDIRECT_REF, field_t, u));
10811           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10812         }
10813
10814       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10815       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10816     }
10817
10818   COND_EXPR_ELSE (cond2) = t;
10819   addr = fold_convert (build_pointer_type (type), cond1);
10820   addr = build_va_arg_indirect_ref (addr);
10821
10822   if (indirect_p)
10823     addr = build_va_arg_indirect_ref (addr);
10824
10825   return addr;
10826 }
10827
10828 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10829
10830 static void
10831 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10832                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10833                                 int no_rtl)
10834 {
10835   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10836   CUMULATIVE_ARGS local_cum;
10837   int gr_saved = cfun->va_list_gpr_size;
10838   int vr_saved = cfun->va_list_fpr_size;
10839
10840   /* The caller has advanced CUM up to, but not beyond, the last named
10841      argument.  Advance a local copy of CUM past the last "real" named
10842      argument, to find out how many registers are left over.  */
10843   local_cum = *cum;
10844   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10845
10846   /* Found out how many registers we need to save.
10847      Honor tree-stdvar analysis results.  */
10848   if (cfun->va_list_gpr_size)
10849     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10850                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10851   if (cfun->va_list_fpr_size)
10852     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10853                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10854
10855   if (!TARGET_FLOAT)
10856     {
10857       gcc_assert (local_cum.aapcs_nvrn == 0);
10858       vr_saved = 0;
10859     }
10860
10861   if (!no_rtl)
10862     {
10863       if (gr_saved > 0)
10864         {
10865           rtx ptr, mem;
10866
10867           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10868           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10869                                - gr_saved * UNITS_PER_WORD);
10870           mem = gen_frame_mem (BLKmode, ptr);
10871           set_mem_alias_set (mem, get_varargs_alias_set ());
10872
10873           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10874                                mem, gr_saved);
10875         }
10876       if (vr_saved > 0)
10877         {
10878           /* We can't use move_block_from_reg, because it will use
10879              the wrong mode, storing D regs only.  */
10880           machine_mode mode = TImode;
10881           int off, i, vr_start;
10882
10883           /* Set OFF to the offset from virtual_incoming_args_rtx of
10884              the first vector register.  The VR save area lies below
10885              the GR one, and is aligned to 16 bytes.  */
10886           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10887                            STACK_BOUNDARY / BITS_PER_UNIT);
10888           off -= vr_saved * UNITS_PER_VREG;
10889
10890           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10891           for (i = 0; i < vr_saved; ++i)
10892             {
10893               rtx ptr, mem;
10894
10895               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10896               mem = gen_frame_mem (mode, ptr);
10897               set_mem_alias_set (mem, get_varargs_alias_set ());
10898               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10899               off += UNITS_PER_VREG;
10900             }
10901         }
10902     }
10903
10904   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10905      any complication of having crtl->args.pretend_args_size changed.  */
10906   cfun->machine->frame.saved_varargs_size
10907     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10908                  STACK_BOUNDARY / BITS_PER_UNIT)
10909        + vr_saved * UNITS_PER_VREG);
10910 }
10911
10912 static void
10913 aarch64_conditional_register_usage (void)
10914 {
10915   int i;
10916   if (!TARGET_FLOAT)
10917     {
10918       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10919         {
10920           fixed_regs[i] = 1;
10921           call_used_regs[i] = 1;
10922         }
10923     }
10924 }
10925
10926 /* Walk down the type tree of TYPE counting consecutive base elements.
10927    If *MODEP is VOIDmode, then set it to the first valid floating point
10928    type.  If a non-floating point type is found, or if a floating point
10929    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10930    otherwise return the count in the sub-tree.  */
10931 static int
10932 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10933 {
10934   machine_mode mode;
10935   HOST_WIDE_INT size;
10936
10937   switch (TREE_CODE (type))
10938     {
10939     case REAL_TYPE:
10940       mode = TYPE_MODE (type);
10941       if (mode != DFmode && mode != SFmode
10942           && mode != TFmode && mode != HFmode)
10943         return -1;
10944
10945       if (*modep == VOIDmode)
10946         *modep = mode;
10947
10948       if (*modep == mode)
10949         return 1;
10950
10951       break;
10952
10953     case COMPLEX_TYPE:
10954       mode = TYPE_MODE (TREE_TYPE (type));
10955       if (mode != DFmode && mode != SFmode
10956           && mode != TFmode && mode != HFmode)
10957         return -1;
10958
10959       if (*modep == VOIDmode)
10960         *modep = mode;
10961
10962       if (*modep == mode)
10963         return 2;
10964
10965       break;
10966
10967     case VECTOR_TYPE:
10968       /* Use V2SImode and V4SImode as representatives of all 64-bit
10969          and 128-bit vector types.  */
10970       size = int_size_in_bytes (type);
10971       switch (size)
10972         {
10973         case 8:
10974           mode = V2SImode;
10975           break;
10976         case 16:
10977           mode = V4SImode;
10978           break;
10979         default:
10980           return -1;
10981         }
10982
10983       if (*modep == VOIDmode)
10984         *modep = mode;
10985
10986       /* Vector modes are considered to be opaque: two vectors are
10987          equivalent for the purposes of being homogeneous aggregates
10988          if they are the same size.  */
10989       if (*modep == mode)
10990         return 1;
10991
10992       break;
10993
10994     case ARRAY_TYPE:
10995       {
10996         int count;
10997         tree index = TYPE_DOMAIN (type);
10998
10999         /* Can't handle incomplete types nor sizes that are not
11000            fixed.  */
11001         if (!COMPLETE_TYPE_P (type)
11002             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11003           return -1;
11004
11005         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11006         if (count == -1
11007             || !index
11008             || !TYPE_MAX_VALUE (index)
11009             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11010             || !TYPE_MIN_VALUE (index)
11011             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11012             || count < 0)
11013           return -1;
11014
11015         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11016                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11017
11018         /* There must be no padding.  */
11019         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11020           return -1;
11021
11022         return count;
11023       }
11024
11025     case RECORD_TYPE:
11026       {
11027         int count = 0;
11028         int sub_count;
11029         tree field;
11030
11031         /* Can't handle incomplete types nor sizes that are not
11032            fixed.  */
11033         if (!COMPLETE_TYPE_P (type)
11034             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11035           return -1;
11036
11037         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11038           {
11039             if (TREE_CODE (field) != FIELD_DECL)
11040               continue;
11041
11042             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11043             if (sub_count < 0)
11044               return -1;
11045             count += sub_count;
11046           }
11047
11048         /* There must be no padding.  */
11049         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11050           return -1;
11051
11052         return count;
11053       }
11054
11055     case UNION_TYPE:
11056     case QUAL_UNION_TYPE:
11057       {
11058         /* These aren't very interesting except in a degenerate case.  */
11059         int count = 0;
11060         int sub_count;
11061         tree field;
11062
11063         /* Can't handle incomplete types nor sizes that are not
11064            fixed.  */
11065         if (!COMPLETE_TYPE_P (type)
11066             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11067           return -1;
11068
11069         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11070           {
11071             if (TREE_CODE (field) != FIELD_DECL)
11072               continue;
11073
11074             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11075             if (sub_count < 0)
11076               return -1;
11077             count = count > sub_count ? count : sub_count;
11078           }
11079
11080         /* There must be no padding.  */
11081         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11082           return -1;
11083
11084         return count;
11085       }
11086
11087     default:
11088       break;
11089     }
11090
11091   return -1;
11092 }
11093
11094 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11095    type as described in AAPCS64 \S 4.1.2.
11096
11097    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11098
11099 static bool
11100 aarch64_short_vector_p (const_tree type,
11101                         machine_mode mode)
11102 {
11103   HOST_WIDE_INT size = -1;
11104
11105   if (type && TREE_CODE (type) == VECTOR_TYPE)
11106     size = int_size_in_bytes (type);
11107   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11108             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11109     size = GET_MODE_SIZE (mode);
11110
11111   return (size == 8 || size == 16);
11112 }
11113
11114 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11115    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11116    array types.  The C99 floating-point complex types are also considered
11117    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11118    types, which are GCC extensions and out of the scope of AAPCS64, are
11119    treated as composite types here as well.
11120
11121    Note that MODE itself is not sufficient in determining whether a type
11122    is such a composite type or not.  This is because
11123    stor-layout.c:compute_record_mode may have already changed the MODE
11124    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11125    structure with only one field may have its MODE set to the mode of the
11126    field.  Also an integer mode whose size matches the size of the
11127    RECORD_TYPE type may be used to substitute the original mode
11128    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11129    solely relied on.  */
11130
11131 static bool
11132 aarch64_composite_type_p (const_tree type,
11133                           machine_mode mode)
11134 {
11135   if (aarch64_short_vector_p (type, mode))
11136     return false;
11137
11138   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11139     return true;
11140
11141   if (mode == BLKmode
11142       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11143       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11144     return true;
11145
11146   return false;
11147 }
11148
11149 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11150    shall be passed or returned in simd/fp register(s) (providing these
11151    parameter passing registers are available).
11152
11153    Upon successful return, *COUNT returns the number of needed registers,
11154    *BASE_MODE returns the mode of the individual register and when IS_HAF
11155    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11156    floating-point aggregate or a homogeneous short-vector aggregate.  */
11157
11158 static bool
11159 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11160                                          const_tree type,
11161                                          machine_mode *base_mode,
11162                                          int *count,
11163                                          bool *is_ha)
11164 {
11165   machine_mode new_mode = VOIDmode;
11166   bool composite_p = aarch64_composite_type_p (type, mode);
11167
11168   if (is_ha != NULL) *is_ha = false;
11169
11170   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11171       || aarch64_short_vector_p (type, mode))
11172     {
11173       *count = 1;
11174       new_mode = mode;
11175     }
11176   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11177     {
11178       if (is_ha != NULL) *is_ha = true;
11179       *count = 2;
11180       new_mode = GET_MODE_INNER (mode);
11181     }
11182   else if (type && composite_p)
11183     {
11184       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11185
11186       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11187         {
11188           if (is_ha != NULL) *is_ha = true;
11189           *count = ag_count;
11190         }
11191       else
11192         return false;
11193     }
11194   else
11195     return false;
11196
11197   *base_mode = new_mode;
11198   return true;
11199 }
11200
11201 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11202
11203 static rtx
11204 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11205                           int incoming ATTRIBUTE_UNUSED)
11206 {
11207   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11208 }
11209
11210 /* Implements target hook vector_mode_supported_p.  */
11211 static bool
11212 aarch64_vector_mode_supported_p (machine_mode mode)
11213 {
11214   if (TARGET_SIMD
11215       && (mode == V4SImode  || mode == V8HImode
11216           || mode == V16QImode || mode == V2DImode
11217           || mode == V2SImode  || mode == V4HImode
11218           || mode == V8QImode || mode == V2SFmode
11219           || mode == V4SFmode || mode == V2DFmode
11220           || mode == V4HFmode || mode == V8HFmode
11221           || mode == V1DFmode))
11222     return true;
11223
11224   return false;
11225 }
11226
11227 /* Return appropriate SIMD container
11228    for MODE within a vector of WIDTH bits.  */
11229 static machine_mode
11230 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11231 {
11232   gcc_assert (width == 64 || width == 128);
11233   if (TARGET_SIMD)
11234     {
11235       if (width == 128)
11236         switch (mode)
11237           {
11238           case E_DFmode:
11239             return V2DFmode;
11240           case E_SFmode:
11241             return V4SFmode;
11242           case E_HFmode:
11243             return V8HFmode;
11244           case E_SImode:
11245             return V4SImode;
11246           case E_HImode:
11247             return V8HImode;
11248           case E_QImode:
11249             return V16QImode;
11250           case E_DImode:
11251             return V2DImode;
11252           default:
11253             break;
11254           }
11255       else
11256         switch (mode)
11257           {
11258           case E_SFmode:
11259             return V2SFmode;
11260           case E_HFmode:
11261             return V4HFmode;
11262           case E_SImode:
11263             return V2SImode;
11264           case E_HImode:
11265             return V4HImode;
11266           case E_QImode:
11267             return V8QImode;
11268           default:
11269             break;
11270           }
11271     }
11272   return word_mode;
11273 }
11274
11275 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11276 static machine_mode
11277 aarch64_preferred_simd_mode (scalar_mode mode)
11278 {
11279   return aarch64_simd_container_mode (mode, 128);
11280 }
11281
11282 /* Return the bitmask of possible vector sizes for the vectorizer
11283    to iterate over.  */
11284 static unsigned int
11285 aarch64_autovectorize_vector_sizes (void)
11286 {
11287   return (16 | 8);
11288 }
11289
11290 /* Implement TARGET_MANGLE_TYPE.  */
11291
11292 static const char *
11293 aarch64_mangle_type (const_tree type)
11294 {
11295   /* The AArch64 ABI documents say that "__va_list" has to be
11296      managled as if it is in the "std" namespace.  */
11297   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11298     return "St9__va_list";
11299
11300   /* Half-precision float.  */
11301   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11302     return "Dh";
11303
11304   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11305      builtin types.  */
11306   if (TYPE_NAME (type) != NULL)
11307     return aarch64_mangle_builtin_type (type);
11308
11309   /* Use the default mangling.  */
11310   return NULL;
11311 }
11312
11313 /* Find the first rtx_insn before insn that will generate an assembly
11314    instruction.  */
11315
11316 static rtx_insn *
11317 aarch64_prev_real_insn (rtx_insn *insn)
11318 {
11319   if (!insn)
11320     return NULL;
11321
11322   do
11323     {
11324       insn = prev_real_insn (insn);
11325     }
11326   while (insn && recog_memoized (insn) < 0);
11327
11328   return insn;
11329 }
11330
11331 static bool
11332 is_madd_op (enum attr_type t1)
11333 {
11334   unsigned int i;
11335   /* A number of these may be AArch32 only.  */
11336   enum attr_type mlatypes[] = {
11337     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11338     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11339     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11340   };
11341
11342   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11343     {
11344       if (t1 == mlatypes[i])
11345         return true;
11346     }
11347
11348   return false;
11349 }
11350
11351 /* Check if there is a register dependency between a load and the insn
11352    for which we hold recog_data.  */
11353
11354 static bool
11355 dep_between_memop_and_curr (rtx memop)
11356 {
11357   rtx load_reg;
11358   int opno;
11359
11360   gcc_assert (GET_CODE (memop) == SET);
11361
11362   if (!REG_P (SET_DEST (memop)))
11363     return false;
11364
11365   load_reg = SET_DEST (memop);
11366   for (opno = 1; opno < recog_data.n_operands; opno++)
11367     {
11368       rtx operand = recog_data.operand[opno];
11369       if (REG_P (operand)
11370           && reg_overlap_mentioned_p (load_reg, operand))
11371         return true;
11372
11373     }
11374   return false;
11375 }
11376
11377
11378 /* When working around the Cortex-A53 erratum 835769,
11379    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11380    instruction and has a preceding memory instruction such that a NOP
11381    should be inserted between them.  */
11382
11383 bool
11384 aarch64_madd_needs_nop (rtx_insn* insn)
11385 {
11386   enum attr_type attr_type;
11387   rtx_insn *prev;
11388   rtx body;
11389
11390   if (!TARGET_FIX_ERR_A53_835769)
11391     return false;
11392
11393   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11394     return false;
11395
11396   attr_type = get_attr_type (insn);
11397   if (!is_madd_op (attr_type))
11398     return false;
11399
11400   prev = aarch64_prev_real_insn (insn);
11401   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11402      Restore recog state to INSN to avoid state corruption.  */
11403   extract_constrain_insn_cached (insn);
11404
11405   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11406     return false;
11407
11408   body = single_set (prev);
11409
11410   /* If the previous insn is a memory op and there is no dependency between
11411      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11412      have a complex memory operation, probably a load/store pair.
11413      Be conservative for now and emit a NOP.  */
11414   if (GET_MODE (recog_data.operand[0]) == DImode
11415       && (!body || !dep_between_memop_and_curr (body)))
11416     return true;
11417
11418   return false;
11419
11420 }
11421
11422
11423 /* Implement FINAL_PRESCAN_INSN.  */
11424
11425 void
11426 aarch64_final_prescan_insn (rtx_insn *insn)
11427 {
11428   if (aarch64_madd_needs_nop (insn))
11429     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11430 }
11431
11432
11433 /* Return the equivalent letter for size.  */
11434 static char
11435 sizetochar (int size)
11436 {
11437   switch (size)
11438     {
11439     case 64: return 'd';
11440     case 32: return 's';
11441     case 16: return 'h';
11442     case 8 : return 'b';
11443     default: gcc_unreachable ();
11444     }
11445 }
11446
11447 /* Return true iff x is a uniform vector of floating-point
11448    constants, and the constant can be represented in
11449    quarter-precision form.  Note, as aarch64_float_const_representable
11450    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11451 static bool
11452 aarch64_vect_float_const_representable_p (rtx x)
11453 {
11454   rtx elt;
11455   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11456           && const_vec_duplicate_p (x, &elt)
11457           && aarch64_float_const_representable_p (elt));
11458 }
11459
11460 /* Return true for valid and false for invalid.  */
11461 bool
11462 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11463                               struct simd_immediate_info *info)
11464 {
11465 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11466   matches = 1;                                          \
11467   for (i = 0; i < idx; i += (STRIDE))                   \
11468     if (!(TEST))                                        \
11469       matches = 0;                                      \
11470   if (matches)                                          \
11471     {                                                   \
11472       immtype = (CLASS);                                \
11473       elsize = (ELSIZE);                                \
11474       eshift = (SHIFT);                                 \
11475       emvn = (NEG);                                     \
11476       break;                                            \
11477     }
11478
11479   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11480   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11481   unsigned char bytes[16];
11482   int immtype = -1, matches;
11483   unsigned int invmask = inverse ? 0xff : 0;
11484   int eshift, emvn;
11485
11486   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11487     {
11488       if (! (aarch64_simd_imm_zero_p (op, mode)
11489              || aarch64_vect_float_const_representable_p (op)))
11490         return false;
11491
11492       if (info)
11493         {
11494           rtx elt = CONST_VECTOR_ELT (op, 0);
11495           scalar_float_mode elt_mode
11496             = as_a <scalar_float_mode> (GET_MODE (elt));
11497
11498           info->value = elt;
11499           info->element_width = GET_MODE_BITSIZE (elt_mode);
11500           info->mvn = false;
11501           info->shift = 0;
11502         }
11503
11504       return true;
11505     }
11506
11507   /* Splat vector constant out into a byte vector.  */
11508   for (i = 0; i < n_elts; i++)
11509     {
11510       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11511          it must be laid out in the vector register in reverse order.  */
11512       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11513       unsigned HOST_WIDE_INT elpart;
11514
11515       gcc_assert (CONST_INT_P (el));
11516       elpart = INTVAL (el);
11517
11518       for (unsigned int byte = 0; byte < innersize; byte++)
11519         {
11520           bytes[idx++] = (elpart & 0xff) ^ invmask;
11521           elpart >>= BITS_PER_UNIT;
11522         }
11523
11524     }
11525
11526   /* Sanity check.  */
11527   gcc_assert (idx == GET_MODE_SIZE (mode));
11528
11529   do
11530     {
11531       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11532              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11533
11534       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11535              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11536
11537       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11538              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11539
11540       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11541              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11542
11543       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11544
11545       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11546
11547       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11548              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11549
11550       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11551              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11552
11553       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11554              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11555
11556       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11557              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11558
11559       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11560
11561       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11562
11563       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11564              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11565
11566       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11567              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11568
11569       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11570              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11571
11572       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11573              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11574
11575       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11576
11577       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11578              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11579     }
11580   while (0);
11581
11582   if (immtype == -1)
11583     return false;
11584
11585   if (info)
11586     {
11587       info->element_width = elsize;
11588       info->mvn = emvn != 0;
11589       info->shift = eshift;
11590
11591       unsigned HOST_WIDE_INT imm = 0;
11592
11593       if (immtype >= 12 && immtype <= 15)
11594         info->msl = true;
11595
11596       /* Un-invert bytes of recognized vector, if necessary.  */
11597       if (invmask != 0)
11598         for (i = 0; i < idx; i++)
11599           bytes[i] ^= invmask;
11600
11601       if (immtype == 17)
11602         {
11603           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11604           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11605
11606           for (i = 0; i < 8; i++)
11607             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11608               << (i * BITS_PER_UNIT);
11609
11610
11611           info->value = GEN_INT (imm);
11612         }
11613       else
11614         {
11615           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11616             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11617
11618           /* Construct 'abcdefgh' because the assembler cannot handle
11619              generic constants.  */
11620           if (info->mvn)
11621             imm = ~imm;
11622           imm = (imm >> info->shift) & 0xff;
11623           info->value = GEN_INT (imm);
11624         }
11625     }
11626
11627   return true;
11628 #undef CHECK
11629 }
11630
11631 /* Check of immediate shift constants are within range.  */
11632 bool
11633 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11634 {
11635   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11636   if (left)
11637     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11638   else
11639     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11640 }
11641
11642 /* Return true if X is a uniform vector where all elements
11643    are either the floating-point constant 0.0 or the
11644    integer constant 0.  */
11645 bool
11646 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11647 {
11648   return x == CONST0_RTX (mode);
11649 }
11650
11651
11652 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11653    operation of width WIDTH at bit position POS.  */
11654
11655 rtx
11656 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11657 {
11658   gcc_assert (CONST_INT_P (width));
11659   gcc_assert (CONST_INT_P (pos));
11660
11661   unsigned HOST_WIDE_INT mask
11662     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11663   return GEN_INT (mask << UINTVAL (pos));
11664 }
11665
11666 bool
11667 aarch64_mov_operand_p (rtx x, machine_mode mode)
11668 {
11669   if (GET_CODE (x) == HIGH
11670       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11671     return true;
11672
11673   if (CONST_INT_P (x))
11674     return true;
11675
11676   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11677     return true;
11678
11679   return aarch64_classify_symbolic_expression (x)
11680     == SYMBOL_TINY_ABSOLUTE;
11681 }
11682
11683 /* Return a const_int vector of VAL.  */
11684 rtx
11685 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11686 {
11687   int nunits = GET_MODE_NUNITS (mode);
11688   rtvec v = rtvec_alloc (nunits);
11689   int i;
11690
11691   rtx cache = GEN_INT (val);
11692
11693   for (i=0; i < nunits; i++)
11694     RTVEC_ELT (v, i) = cache;
11695
11696   return gen_rtx_CONST_VECTOR (mode, v);
11697 }
11698
11699 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11700
11701 bool
11702 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11703 {
11704   machine_mode vmode;
11705
11706   gcc_assert (!VECTOR_MODE_P (mode));
11707   vmode = aarch64_preferred_simd_mode (as_a <scalar_mode> (mode));
11708   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11709   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11710 }
11711
11712 /* Construct and return a PARALLEL RTX vector with elements numbering the
11713    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11714    the vector - from the perspective of the architecture.  This does not
11715    line up with GCC's perspective on lane numbers, so we end up with
11716    different masks depending on our target endian-ness.  The diagram
11717    below may help.  We must draw the distinction when building masks
11718    which select one half of the vector.  An instruction selecting
11719    architectural low-lanes for a big-endian target, must be described using
11720    a mask selecting GCC high-lanes.
11721
11722                  Big-Endian             Little-Endian
11723
11724 GCC             0   1   2   3           3   2   1   0
11725               | x | x | x | x |       | x | x | x | x |
11726 Architecture    3   2   1   0           3   2   1   0
11727
11728 Low Mask:         { 2, 3 }                { 0, 1 }
11729 High Mask:        { 0, 1 }                { 2, 3 }
11730 */
11731
11732 rtx
11733 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11734 {
11735   int nunits = GET_MODE_NUNITS (mode);
11736   rtvec v = rtvec_alloc (nunits / 2);
11737   int high_base = nunits / 2;
11738   int low_base = 0;
11739   int base;
11740   rtx t1;
11741   int i;
11742
11743   if (BYTES_BIG_ENDIAN)
11744     base = high ? low_base : high_base;
11745   else
11746     base = high ? high_base : low_base;
11747
11748   for (i = 0; i < nunits / 2; i++)
11749     RTVEC_ELT (v, i) = GEN_INT (base + i);
11750
11751   t1 = gen_rtx_PARALLEL (mode, v);
11752   return t1;
11753 }
11754
11755 /* Check OP for validity as a PARALLEL RTX vector with elements
11756    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11757    from the perspective of the architecture.  See the diagram above
11758    aarch64_simd_vect_par_cnst_half for more details.  */
11759
11760 bool
11761 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11762                                        bool high)
11763 {
11764   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11765   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11766   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11767   int i = 0;
11768
11769   if (!VECTOR_MODE_P (mode))
11770     return false;
11771
11772   if (count_op != count_ideal)
11773     return false;
11774
11775   for (i = 0; i < count_ideal; i++)
11776     {
11777       rtx elt_op = XVECEXP (op, 0, i);
11778       rtx elt_ideal = XVECEXP (ideal, 0, i);
11779
11780       if (!CONST_INT_P (elt_op)
11781           || INTVAL (elt_ideal) != INTVAL (elt_op))
11782         return false;
11783     }
11784   return true;
11785 }
11786
11787 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11788    HIGH (exclusive).  */
11789 void
11790 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11791                           const_tree exp)
11792 {
11793   HOST_WIDE_INT lane;
11794   gcc_assert (CONST_INT_P (operand));
11795   lane = INTVAL (operand);
11796
11797   if (lane < low || lane >= high)
11798   {
11799     if (exp)
11800       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11801     else
11802       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11803   }
11804 }
11805
11806 /* Return TRUE if OP is a valid vector addressing mode.  */
11807 bool
11808 aarch64_simd_mem_operand_p (rtx op)
11809 {
11810   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11811                         || REG_P (XEXP (op, 0)));
11812 }
11813
11814 /* Emit a register copy from operand to operand, taking care not to
11815    early-clobber source registers in the process.
11816
11817    COUNT is the number of components into which the copy needs to be
11818    decomposed.  */
11819 void
11820 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11821                                 unsigned int count)
11822 {
11823   unsigned int i;
11824   int rdest = REGNO (operands[0]);
11825   int rsrc = REGNO (operands[1]);
11826
11827   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11828       || rdest < rsrc)
11829     for (i = 0; i < count; i++)
11830       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11831                       gen_rtx_REG (mode, rsrc + i));
11832   else
11833     for (i = 0; i < count; i++)
11834       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11835                       gen_rtx_REG (mode, rsrc + count - i - 1));
11836 }
11837
11838 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11839    one of VSTRUCT modes: OI, CI, or XI.  */
11840 int
11841 aarch64_simd_attr_length_rglist (machine_mode mode)
11842 {
11843   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11844 }
11845
11846 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11847    alignment of a vector to 128 bits.  */
11848 static HOST_WIDE_INT
11849 aarch64_simd_vector_alignment (const_tree type)
11850 {
11851   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11852   return MIN (align, 128);
11853 }
11854
11855 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11856 static bool
11857 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11858 {
11859   if (is_packed)
11860     return false;
11861
11862   /* We guarantee alignment for vectors up to 128-bits.  */
11863   if (tree_int_cst_compare (TYPE_SIZE (type),
11864                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11865     return false;
11866
11867   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11868   return true;
11869 }
11870
11871 /* Return true if the vector misalignment factor is supported by the
11872    target.  */
11873 static bool
11874 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11875                                              const_tree type, int misalignment,
11876                                              bool is_packed)
11877 {
11878   if (TARGET_SIMD && STRICT_ALIGNMENT)
11879     {
11880       /* Return if movmisalign pattern is not supported for this mode.  */
11881       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11882         return false;
11883
11884       if (misalignment == -1)
11885         {
11886           /* Misalignment factor is unknown at compile time but we know
11887              it's word aligned.  */
11888           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11889             {
11890               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11891
11892               if (element_size != 64)
11893                 return true;
11894             }
11895           return false;
11896         }
11897     }
11898   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11899                                                       is_packed);
11900 }
11901
11902 /* If VALS is a vector constant that can be loaded into a register
11903    using DUP, generate instructions to do so and return an RTX to
11904    assign to the register.  Otherwise return NULL_RTX.  */
11905 static rtx
11906 aarch64_simd_dup_constant (rtx vals)
11907 {
11908   machine_mode mode = GET_MODE (vals);
11909   machine_mode inner_mode = GET_MODE_INNER (mode);
11910   rtx x;
11911
11912   if (!const_vec_duplicate_p (vals, &x))
11913     return NULL_RTX;
11914
11915   /* We can load this constant by using DUP and a constant in a
11916      single ARM register.  This will be cheaper than a vector
11917      load.  */
11918   x = copy_to_mode_reg (inner_mode, x);
11919   return gen_rtx_VEC_DUPLICATE (mode, x);
11920 }
11921
11922
11923 /* Generate code to load VALS, which is a PARALLEL containing only
11924    constants (for vec_init) or CONST_VECTOR, efficiently into a
11925    register.  Returns an RTX to copy into the register, or NULL_RTX
11926    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11927 static rtx
11928 aarch64_simd_make_constant (rtx vals)
11929 {
11930   machine_mode mode = GET_MODE (vals);
11931   rtx const_dup;
11932   rtx const_vec = NULL_RTX;
11933   int n_elts = GET_MODE_NUNITS (mode);
11934   int n_const = 0;
11935   int i;
11936
11937   if (GET_CODE (vals) == CONST_VECTOR)
11938     const_vec = vals;
11939   else if (GET_CODE (vals) == PARALLEL)
11940     {
11941       /* A CONST_VECTOR must contain only CONST_INTs and
11942          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11943          Only store valid constants in a CONST_VECTOR.  */
11944       for (i = 0; i < n_elts; ++i)
11945         {
11946           rtx x = XVECEXP (vals, 0, i);
11947           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11948             n_const++;
11949         }
11950       if (n_const == n_elts)
11951         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11952     }
11953   else
11954     gcc_unreachable ();
11955
11956   if (const_vec != NULL_RTX
11957       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11958     /* Load using MOVI/MVNI.  */
11959     return const_vec;
11960   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11961     /* Loaded using DUP.  */
11962     return const_dup;
11963   else if (const_vec != NULL_RTX)
11964     /* Load from constant pool. We can not take advantage of single-cycle
11965        LD1 because we need a PC-relative addressing mode.  */
11966     return const_vec;
11967   else
11968     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11969        We can not construct an initializer.  */
11970     return NULL_RTX;
11971 }
11972
11973 /* Expand a vector initialisation sequence, such that TARGET is
11974    initialised to contain VALS.  */
11975
11976 void
11977 aarch64_expand_vector_init (rtx target, rtx vals)
11978 {
11979   machine_mode mode = GET_MODE (target);
11980   machine_mode inner_mode = GET_MODE_INNER (mode);
11981   /* The number of vector elements.  */
11982   int n_elts = GET_MODE_NUNITS (mode);
11983   /* The number of vector elements which are not constant.  */
11984   int n_var = 0;
11985   rtx any_const = NULL_RTX;
11986   /* The first element of vals.  */
11987   rtx v0 = XVECEXP (vals, 0, 0);
11988   bool all_same = true;
11989
11990   /* Count the number of variable elements to initialise.  */
11991   for (int i = 0; i < n_elts; ++i)
11992     {
11993       rtx x = XVECEXP (vals, 0, i);
11994       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11995         ++n_var;
11996       else
11997         any_const = x;
11998
11999       all_same &= rtx_equal_p (x, v0);
12000     }
12001
12002   /* No variable elements, hand off to aarch64_simd_make_constant which knows
12003      how best to handle this.  */
12004   if (n_var == 0)
12005     {
12006       rtx constant = aarch64_simd_make_constant (vals);
12007       if (constant != NULL_RTX)
12008         {
12009           emit_move_insn (target, constant);
12010           return;
12011         }
12012     }
12013
12014   /* Splat a single non-constant element if we can.  */
12015   if (all_same)
12016     {
12017       rtx x = copy_to_mode_reg (inner_mode, v0);
12018       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12019       return;
12020     }
12021
12022   enum insn_code icode = optab_handler (vec_set_optab, mode);
12023   gcc_assert (icode != CODE_FOR_nothing);
12024
12025   /* If there are only variable elements, try to optimize
12026      the insertion using dup for the most common element
12027      followed by insertions.  */
12028
12029   /* The algorithm will fill matches[*][0] with the earliest matching element,
12030      and matches[X][1] with the count of duplicate elements (if X is the
12031      earliest element which has duplicates).  */
12032
12033   if (n_var == n_elts && n_elts <= 16)
12034     {
12035       int matches[16][2] = {0};
12036       for (int i = 0; i < n_elts; i++)
12037         {
12038           for (int j = 0; j <= i; j++)
12039             {
12040               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12041                 {
12042                   matches[i][0] = j;
12043                   matches[j][1]++;
12044                   break;
12045                 }
12046             }
12047         }
12048       int maxelement = 0;
12049       int maxv = 0;
12050       for (int i = 0; i < n_elts; i++)
12051         if (matches[i][1] > maxv)
12052           {
12053             maxelement = i;
12054             maxv = matches[i][1];
12055           }
12056
12057       /* Create a duplicate of the most common element.  */
12058       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12059       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12060
12061       /* Insert the rest.  */
12062       for (int i = 0; i < n_elts; i++)
12063         {
12064           rtx x = XVECEXP (vals, 0, i);
12065           if (matches[i][0] == maxelement)
12066             continue;
12067           x = copy_to_mode_reg (inner_mode, x);
12068           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12069         }
12070       return;
12071     }
12072
12073   /* Initialise a vector which is part-variable.  We want to first try
12074      to build those lanes which are constant in the most efficient way we
12075      can.  */
12076   if (n_var != n_elts)
12077     {
12078       rtx copy = copy_rtx (vals);
12079
12080       /* Load constant part of vector.  We really don't care what goes into the
12081          parts we will overwrite, but we're more likely to be able to load the
12082          constant efficiently if it has fewer, larger, repeating parts
12083          (see aarch64_simd_valid_immediate).  */
12084       for (int i = 0; i < n_elts; i++)
12085         {
12086           rtx x = XVECEXP (vals, 0, i);
12087           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12088             continue;
12089           rtx subst = any_const;
12090           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12091             {
12092               /* Look in the copied vector, as more elements are const.  */
12093               rtx test = XVECEXP (copy, 0, i ^ bit);
12094               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12095                 {
12096                   subst = test;
12097                   break;
12098                 }
12099             }
12100           XVECEXP (copy, 0, i) = subst;
12101         }
12102       aarch64_expand_vector_init (target, copy);
12103     }
12104
12105   /* Insert the variable lanes directly.  */
12106   for (int i = 0; i < n_elts; i++)
12107     {
12108       rtx x = XVECEXP (vals, 0, i);
12109       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12110         continue;
12111       x = copy_to_mode_reg (inner_mode, x);
12112       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12113     }
12114 }
12115
12116 static unsigned HOST_WIDE_INT
12117 aarch64_shift_truncation_mask (machine_mode mode)
12118 {
12119   return
12120     (!SHIFT_COUNT_TRUNCATED
12121      || aarch64_vector_mode_supported_p (mode)
12122      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12123 }
12124
12125 /* Select a format to encode pointers in exception handling data.  */
12126 int
12127 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12128 {
12129    int type;
12130    switch (aarch64_cmodel)
12131      {
12132      case AARCH64_CMODEL_TINY:
12133      case AARCH64_CMODEL_TINY_PIC:
12134      case AARCH64_CMODEL_SMALL:
12135      case AARCH64_CMODEL_SMALL_PIC:
12136      case AARCH64_CMODEL_SMALL_SPIC:
12137        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12138           for everything.  */
12139        type = DW_EH_PE_sdata4;
12140        break;
12141      default:
12142        /* No assumptions here.  8-byte relocs required.  */
12143        type = DW_EH_PE_sdata8;
12144        break;
12145      }
12146    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12147 }
12148
12149 /* The last .arch and .tune assembly strings that we printed.  */
12150 static std::string aarch64_last_printed_arch_string;
12151 static std::string aarch64_last_printed_tune_string;
12152
12153 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12154    by the function fndecl.  */
12155
12156 void
12157 aarch64_declare_function_name (FILE *stream, const char* name,
12158                                 tree fndecl)
12159 {
12160   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12161
12162   struct cl_target_option *targ_options;
12163   if (target_parts)
12164     targ_options = TREE_TARGET_OPTION (target_parts);
12165   else
12166     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12167   gcc_assert (targ_options);
12168
12169   const struct processor *this_arch
12170     = aarch64_get_arch (targ_options->x_explicit_arch);
12171
12172   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12173   std::string extension
12174     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12175                                                   this_arch->flags);
12176   /* Only update the assembler .arch string if it is distinct from the last
12177      such string we printed.  */
12178   std::string to_print = this_arch->name + extension;
12179   if (to_print != aarch64_last_printed_arch_string)
12180     {
12181       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12182       aarch64_last_printed_arch_string = to_print;
12183     }
12184
12185   /* Print the cpu name we're tuning for in the comments, might be
12186      useful to readers of the generated asm.  Do it only when it changes
12187      from function to function and verbose assembly is requested.  */
12188   const struct processor *this_tune
12189     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12190
12191   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12192     {
12193       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12194                    this_tune->name);
12195       aarch64_last_printed_tune_string = this_tune->name;
12196     }
12197
12198   /* Don't forget the type directive for ELF.  */
12199   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12200   ASM_OUTPUT_LABEL (stream, name);
12201 }
12202
12203 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12204
12205 static void
12206 aarch64_start_file (void)
12207 {
12208   struct cl_target_option *default_options
12209     = TREE_TARGET_OPTION (target_option_default_node);
12210
12211   const struct processor *default_arch
12212     = aarch64_get_arch (default_options->x_explicit_arch);
12213   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12214   std::string extension
12215     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12216                                                   default_arch->flags);
12217
12218    aarch64_last_printed_arch_string = default_arch->name + extension;
12219    aarch64_last_printed_tune_string = "";
12220    asm_fprintf (asm_out_file, "\t.arch %s\n",
12221                 aarch64_last_printed_arch_string.c_str ());
12222
12223    default_file_start ();
12224 }
12225
12226 /* Emit load exclusive.  */
12227
12228 static void
12229 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12230                              rtx mem, rtx model_rtx)
12231 {
12232   rtx (*gen) (rtx, rtx, rtx);
12233
12234   switch (mode)
12235     {
12236     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12237     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12238     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12239     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12240     default:
12241       gcc_unreachable ();
12242     }
12243
12244   emit_insn (gen (rval, mem, model_rtx));
12245 }
12246
12247 /* Emit store exclusive.  */
12248
12249 static void
12250 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12251                               rtx rval, rtx mem, rtx model_rtx)
12252 {
12253   rtx (*gen) (rtx, rtx, rtx, rtx);
12254
12255   switch (mode)
12256     {
12257     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12258     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12259     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12260     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12261     default:
12262       gcc_unreachable ();
12263     }
12264
12265   emit_insn (gen (bval, rval, mem, model_rtx));
12266 }
12267
12268 /* Mark the previous jump instruction as unlikely.  */
12269
12270 static void
12271 aarch64_emit_unlikely_jump (rtx insn)
12272 {
12273   rtx_insn *jump = emit_jump_insn (insn);
12274   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12275 }
12276
12277 /* Expand a compare and swap pattern.  */
12278
12279 void
12280 aarch64_expand_compare_and_swap (rtx operands[])
12281 {
12282   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12283   machine_mode mode, cmp_mode;
12284   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12285   int idx;
12286   gen_cas_fn gen;
12287   const gen_cas_fn split_cas[] =
12288   {
12289     gen_aarch64_compare_and_swapqi,
12290     gen_aarch64_compare_and_swaphi,
12291     gen_aarch64_compare_and_swapsi,
12292     gen_aarch64_compare_and_swapdi
12293   };
12294   const gen_cas_fn atomic_cas[] =
12295   {
12296     gen_aarch64_compare_and_swapqi_lse,
12297     gen_aarch64_compare_and_swaphi_lse,
12298     gen_aarch64_compare_and_swapsi_lse,
12299     gen_aarch64_compare_and_swapdi_lse
12300   };
12301
12302   bval = operands[0];
12303   rval = operands[1];
12304   mem = operands[2];
12305   oldval = operands[3];
12306   newval = operands[4];
12307   is_weak = operands[5];
12308   mod_s = operands[6];
12309   mod_f = operands[7];
12310   mode = GET_MODE (mem);
12311   cmp_mode = mode;
12312
12313   /* Normally the succ memory model must be stronger than fail, but in the
12314      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12315      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12316
12317   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12318       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12319     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12320
12321   switch (mode)
12322     {
12323     case E_QImode:
12324     case E_HImode:
12325       /* For short modes, we're going to perform the comparison in SImode,
12326          so do the zero-extension now.  */
12327       cmp_mode = SImode;
12328       rval = gen_reg_rtx (SImode);
12329       oldval = convert_modes (SImode, mode, oldval, true);
12330       /* Fall through.  */
12331
12332     case E_SImode:
12333     case E_DImode:
12334       /* Force the value into a register if needed.  */
12335       if (!aarch64_plus_operand (oldval, mode))
12336         oldval = force_reg (cmp_mode, oldval);
12337       break;
12338
12339     default:
12340       gcc_unreachable ();
12341     }
12342
12343   switch (mode)
12344     {
12345     case E_QImode: idx = 0; break;
12346     case E_HImode: idx = 1; break;
12347     case E_SImode: idx = 2; break;
12348     case E_DImode: idx = 3; break;
12349     default:
12350       gcc_unreachable ();
12351     }
12352   if (TARGET_LSE)
12353     gen = atomic_cas[idx];
12354   else
12355     gen = split_cas[idx];
12356
12357   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12358
12359   if (mode == QImode || mode == HImode)
12360     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12361
12362   x = gen_rtx_REG (CCmode, CC_REGNUM);
12363   x = gen_rtx_EQ (SImode, x, const0_rtx);
12364   emit_insn (gen_rtx_SET (bval, x));
12365 }
12366
12367 /* Test whether the target supports using a atomic load-operate instruction.
12368    CODE is the operation and AFTER is TRUE if the data in memory after the
12369    operation should be returned and FALSE if the data before the operation
12370    should be returned.  Returns FALSE if the operation isn't supported by the
12371    architecture.  */
12372
12373 bool
12374 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12375 {
12376   if (!TARGET_LSE)
12377     return false;
12378
12379   switch (code)
12380     {
12381     case SET:
12382     case AND:
12383     case IOR:
12384     case XOR:
12385     case MINUS:
12386     case PLUS:
12387       return true;
12388     default:
12389       return false;
12390     }
12391 }
12392
12393 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12394    sequence implementing an atomic operation.  */
12395
12396 static void
12397 aarch64_emit_post_barrier (enum memmodel model)
12398 {
12399   const enum memmodel base_model = memmodel_base (model);
12400
12401   if (is_mm_sync (model)
12402       && (base_model == MEMMODEL_ACQUIRE
12403           || base_model == MEMMODEL_ACQ_REL
12404           || base_model == MEMMODEL_SEQ_CST))
12405     {
12406       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12407     }
12408 }
12409
12410 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12411    for the data in memory.  EXPECTED is the value expected to be in memory.
12412    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12413    is the memory ordering to use.  */
12414
12415 void
12416 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12417                         rtx expected, rtx desired,
12418                         rtx model)
12419 {
12420   rtx (*gen) (rtx, rtx, rtx, rtx);
12421   machine_mode mode;
12422
12423   mode = GET_MODE (mem);
12424
12425   switch (mode)
12426     {
12427     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12428     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12429     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12430     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12431     default:
12432       gcc_unreachable ();
12433     }
12434
12435   /* Move the expected value into the CAS destination register.  */
12436   emit_insn (gen_rtx_SET (rval, expected));
12437
12438   /* Emit the CAS.  */
12439   emit_insn (gen (rval, mem, desired, model));
12440
12441   /* Compare the expected value with the value loaded by the CAS, to establish
12442      whether the swap was made.  */
12443   aarch64_gen_compare_reg (EQ, rval, expected);
12444 }
12445
12446 /* Split a compare and swap pattern.  */
12447
12448 void
12449 aarch64_split_compare_and_swap (rtx operands[])
12450 {
12451   rtx rval, mem, oldval, newval, scratch;
12452   machine_mode mode;
12453   bool is_weak;
12454   rtx_code_label *label1, *label2;
12455   rtx x, cond;
12456   enum memmodel model;
12457   rtx model_rtx;
12458
12459   rval = operands[0];
12460   mem = operands[1];
12461   oldval = operands[2];
12462   newval = operands[3];
12463   is_weak = (operands[4] != const0_rtx);
12464   model_rtx = operands[5];
12465   scratch = operands[7];
12466   mode = GET_MODE (mem);
12467   model = memmodel_from_int (INTVAL (model_rtx));
12468
12469   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12470     loop:
12471     .label1:
12472         LD[A]XR rval, [mem]
12473         CBNZ    rval, .label2
12474         ST[L]XR scratch, newval, [mem]
12475         CBNZ    scratch, .label1
12476     .label2:
12477         CMP     rval, 0.  */
12478   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12479
12480   label1 = NULL;
12481   if (!is_weak)
12482     {
12483       label1 = gen_label_rtx ();
12484       emit_label (label1);
12485     }
12486   label2 = gen_label_rtx ();
12487
12488   /* The initial load can be relaxed for a __sync operation since a final
12489      barrier will be emitted to stop code hoisting.  */
12490   if (is_mm_sync (model))
12491     aarch64_emit_load_exclusive (mode, rval, mem,
12492                                  GEN_INT (MEMMODEL_RELAXED));
12493   else
12494     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12495
12496   if (strong_zero_p)
12497     {
12498       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12499       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12500                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12501       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12502     }
12503   else
12504     {
12505       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12506       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12507       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12508                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12509       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12510     }
12511
12512   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12513
12514   if (!is_weak)
12515     {
12516       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12517       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12518                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12519       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12520     }
12521   else
12522     {
12523       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12524       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12525       emit_insn (gen_rtx_SET (cond, x));
12526     }
12527
12528   emit_label (label2);
12529   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12530      to set the condition flags.  If this is not used it will be removed by
12531      later passes.  */
12532   if (strong_zero_p)
12533     {
12534       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12535       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12536       emit_insn (gen_rtx_SET (cond, x));
12537     }
12538   /* Emit any final barrier needed for a __sync operation.  */
12539   if (is_mm_sync (model))
12540     aarch64_emit_post_barrier (model);
12541 }
12542
12543 /* Emit a BIC instruction.  */
12544
12545 static void
12546 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12547 {
12548   rtx shift_rtx = GEN_INT (shift);
12549   rtx (*gen) (rtx, rtx, rtx, rtx);
12550
12551   switch (mode)
12552     {
12553     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12554     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12555     default:
12556       gcc_unreachable ();
12557     }
12558
12559   emit_insn (gen (dst, s2, shift_rtx, s1));
12560 }
12561
12562 /* Emit an atomic swap.  */
12563
12564 static void
12565 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12566                           rtx mem, rtx model)
12567 {
12568   rtx (*gen) (rtx, rtx, rtx, rtx);
12569
12570   switch (mode)
12571     {
12572     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12573     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12574     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12575     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12576     default:
12577       gcc_unreachable ();
12578     }
12579
12580   emit_insn (gen (dst, mem, value, model));
12581 }
12582
12583 /* Operations supported by aarch64_emit_atomic_load_op.  */
12584
12585 enum aarch64_atomic_load_op_code
12586 {
12587   AARCH64_LDOP_PLUS,    /* A + B  */
12588   AARCH64_LDOP_XOR,     /* A ^ B  */
12589   AARCH64_LDOP_OR,      /* A | B  */
12590   AARCH64_LDOP_BIC      /* A & ~B  */
12591 };
12592
12593 /* Emit an atomic load-operate.  */
12594
12595 static void
12596 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12597                              machine_mode mode, rtx dst, rtx src,
12598                              rtx mem, rtx model)
12599 {
12600   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12601   const aarch64_atomic_load_op_fn plus[] =
12602   {
12603     gen_aarch64_atomic_loadaddqi,
12604     gen_aarch64_atomic_loadaddhi,
12605     gen_aarch64_atomic_loadaddsi,
12606     gen_aarch64_atomic_loadadddi
12607   };
12608   const aarch64_atomic_load_op_fn eor[] =
12609   {
12610     gen_aarch64_atomic_loadeorqi,
12611     gen_aarch64_atomic_loadeorhi,
12612     gen_aarch64_atomic_loadeorsi,
12613     gen_aarch64_atomic_loadeordi
12614   };
12615   const aarch64_atomic_load_op_fn ior[] =
12616   {
12617     gen_aarch64_atomic_loadsetqi,
12618     gen_aarch64_atomic_loadsethi,
12619     gen_aarch64_atomic_loadsetsi,
12620     gen_aarch64_atomic_loadsetdi
12621   };
12622   const aarch64_atomic_load_op_fn bic[] =
12623   {
12624     gen_aarch64_atomic_loadclrqi,
12625     gen_aarch64_atomic_loadclrhi,
12626     gen_aarch64_atomic_loadclrsi,
12627     gen_aarch64_atomic_loadclrdi
12628   };
12629   aarch64_atomic_load_op_fn gen;
12630   int idx = 0;
12631
12632   switch (mode)
12633     {
12634     case E_QImode: idx = 0; break;
12635     case E_HImode: idx = 1; break;
12636     case E_SImode: idx = 2; break;
12637     case E_DImode: idx = 3; break;
12638     default:
12639       gcc_unreachable ();
12640     }
12641
12642   switch (code)
12643     {
12644     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12645     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12646     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12647     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12648     default:
12649       gcc_unreachable ();
12650     }
12651
12652   emit_insn (gen (dst, mem, src, model));
12653 }
12654
12655 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12656    location to store the data read from memory.  OUT_RESULT is the location to
12657    store the result of the operation.  MEM is the memory location to read and
12658    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12659    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12660    be NULL.  */
12661
12662 void
12663 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12664                          rtx mem, rtx value, rtx model_rtx)
12665 {
12666   machine_mode mode = GET_MODE (mem);
12667   machine_mode wmode = (mode == DImode ? DImode : SImode);
12668   const bool short_mode = (mode < SImode);
12669   aarch64_atomic_load_op_code ldop_code;
12670   rtx src;
12671   rtx x;
12672
12673   if (out_data)
12674     out_data = gen_lowpart (mode, out_data);
12675
12676   if (out_result)
12677     out_result = gen_lowpart (mode, out_result);
12678
12679   /* Make sure the value is in a register, putting it into a destination
12680      register if it needs to be manipulated.  */
12681   if (!register_operand (value, mode)
12682       || code == AND || code == MINUS)
12683     {
12684       src = out_result ? out_result : out_data;
12685       emit_move_insn (src, gen_lowpart (mode, value));
12686     }
12687   else
12688     src = value;
12689   gcc_assert (register_operand (src, mode));
12690
12691   /* Preprocess the data for the operation as necessary.  If the operation is
12692      a SET then emit a swap instruction and finish.  */
12693   switch (code)
12694     {
12695     case SET:
12696       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12697       return;
12698
12699     case MINUS:
12700       /* Negate the value and treat it as a PLUS.  */
12701       {
12702         rtx neg_src;
12703
12704         /* Resize the value if necessary.  */
12705         if (short_mode)
12706           src = gen_lowpart (wmode, src);
12707
12708         neg_src = gen_rtx_NEG (wmode, src);
12709         emit_insn (gen_rtx_SET (src, neg_src));
12710
12711         if (short_mode)
12712           src = gen_lowpart (mode, src);
12713       }
12714       /* Fall-through.  */
12715     case PLUS:
12716       ldop_code = AARCH64_LDOP_PLUS;
12717       break;
12718
12719     case IOR:
12720       ldop_code = AARCH64_LDOP_OR;
12721       break;
12722
12723     case XOR:
12724       ldop_code = AARCH64_LDOP_XOR;
12725       break;
12726
12727     case AND:
12728       {
12729         rtx not_src;
12730
12731         /* Resize the value if necessary.  */
12732         if (short_mode)
12733           src = gen_lowpart (wmode, src);
12734
12735         not_src = gen_rtx_NOT (wmode, src);
12736         emit_insn (gen_rtx_SET (src, not_src));
12737
12738         if (short_mode)
12739           src = gen_lowpart (mode, src);
12740       }
12741       ldop_code = AARCH64_LDOP_BIC;
12742       break;
12743
12744     default:
12745       /* The operation can't be done with atomic instructions.  */
12746       gcc_unreachable ();
12747     }
12748
12749   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12750
12751   /* If necessary, calculate the data in memory after the update by redoing the
12752      operation from values in registers.  */
12753   if (!out_result)
12754     return;
12755
12756   if (short_mode)
12757     {
12758       src = gen_lowpart (wmode, src);
12759       out_data = gen_lowpart (wmode, out_data);
12760       out_result = gen_lowpart (wmode, out_result);
12761     }
12762
12763   x = NULL_RTX;
12764
12765   switch (code)
12766     {
12767     case MINUS:
12768     case PLUS:
12769       x = gen_rtx_PLUS (wmode, out_data, src);
12770       break;
12771     case IOR:
12772       x = gen_rtx_IOR (wmode, out_data, src);
12773       break;
12774     case XOR:
12775       x = gen_rtx_XOR (wmode, out_data, src);
12776       break;
12777     case AND:
12778       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12779       return;
12780     default:
12781       gcc_unreachable ();
12782     }
12783
12784   emit_set_insn (out_result, x);
12785
12786   return;
12787 }
12788
12789 /* Split an atomic operation.  */
12790
12791 void
12792 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12793                          rtx value, rtx model_rtx, rtx cond)
12794 {
12795   machine_mode mode = GET_MODE (mem);
12796   machine_mode wmode = (mode == DImode ? DImode : SImode);
12797   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12798   const bool is_sync = is_mm_sync (model);
12799   rtx_code_label *label;
12800   rtx x;
12801
12802   /* Split the atomic operation into a sequence.  */
12803   label = gen_label_rtx ();
12804   emit_label (label);
12805
12806   if (new_out)
12807     new_out = gen_lowpart (wmode, new_out);
12808   if (old_out)
12809     old_out = gen_lowpart (wmode, old_out);
12810   else
12811     old_out = new_out;
12812   value = simplify_gen_subreg (wmode, value, mode, 0);
12813
12814   /* The initial load can be relaxed for a __sync operation since a final
12815      barrier will be emitted to stop code hoisting.  */
12816  if (is_sync)
12817     aarch64_emit_load_exclusive (mode, old_out, mem,
12818                                  GEN_INT (MEMMODEL_RELAXED));
12819   else
12820     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12821
12822   switch (code)
12823     {
12824     case SET:
12825       new_out = value;
12826       break;
12827
12828     case NOT:
12829       x = gen_rtx_AND (wmode, old_out, value);
12830       emit_insn (gen_rtx_SET (new_out, x));
12831       x = gen_rtx_NOT (wmode, new_out);
12832       emit_insn (gen_rtx_SET (new_out, x));
12833       break;
12834
12835     case MINUS:
12836       if (CONST_INT_P (value))
12837         {
12838           value = GEN_INT (-INTVAL (value));
12839           code = PLUS;
12840         }
12841       /* Fall through.  */
12842
12843     default:
12844       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12845       emit_insn (gen_rtx_SET (new_out, x));
12846       break;
12847     }
12848
12849   aarch64_emit_store_exclusive (mode, cond, mem,
12850                                 gen_lowpart (mode, new_out), model_rtx);
12851
12852   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12853   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12854                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12855   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12856
12857   /* Emit any final barrier needed for a __sync operation.  */
12858   if (is_sync)
12859     aarch64_emit_post_barrier (model);
12860 }
12861
12862 static void
12863 aarch64_init_libfuncs (void)
12864 {
12865    /* Half-precision float operations.  The compiler handles all operations
12866      with NULL libfuncs by converting to SFmode.  */
12867
12868   /* Conversions.  */
12869   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12870   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12871
12872   /* Arithmetic.  */
12873   set_optab_libfunc (add_optab, HFmode, NULL);
12874   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12875   set_optab_libfunc (smul_optab, HFmode, NULL);
12876   set_optab_libfunc (neg_optab, HFmode, NULL);
12877   set_optab_libfunc (sub_optab, HFmode, NULL);
12878
12879   /* Comparisons.  */
12880   set_optab_libfunc (eq_optab, HFmode, NULL);
12881   set_optab_libfunc (ne_optab, HFmode, NULL);
12882   set_optab_libfunc (lt_optab, HFmode, NULL);
12883   set_optab_libfunc (le_optab, HFmode, NULL);
12884   set_optab_libfunc (ge_optab, HFmode, NULL);
12885   set_optab_libfunc (gt_optab, HFmode, NULL);
12886   set_optab_libfunc (unord_optab, HFmode, NULL);
12887 }
12888
12889 /* Target hook for c_mode_for_suffix.  */
12890 static machine_mode
12891 aarch64_c_mode_for_suffix (char suffix)
12892 {
12893   if (suffix == 'q')
12894     return TFmode;
12895
12896   return VOIDmode;
12897 }
12898
12899 /* We can only represent floating point constants which will fit in
12900    "quarter-precision" values.  These values are characterised by
12901    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12902    by:
12903
12904    (-1)^s * (n/16) * 2^r
12905
12906    Where:
12907      's' is the sign bit.
12908      'n' is an integer in the range 16 <= n <= 31.
12909      'r' is an integer in the range -3 <= r <= 4.  */
12910
12911 /* Return true iff X can be represented by a quarter-precision
12912    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12913 bool
12914 aarch64_float_const_representable_p (rtx x)
12915 {
12916   /* This represents our current view of how many bits
12917      make up the mantissa.  */
12918   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12919   int exponent;
12920   unsigned HOST_WIDE_INT mantissa, mask;
12921   REAL_VALUE_TYPE r, m;
12922   bool fail;
12923
12924   if (!CONST_DOUBLE_P (x))
12925     return false;
12926
12927   /* We don't support HFmode constants yet.  */
12928   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12929     return false;
12930
12931   r = *CONST_DOUBLE_REAL_VALUE (x);
12932
12933   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12934      know if we have +zero until we analyse the mantissa, but we
12935      can reject the other invalid values.  */
12936   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12937       || REAL_VALUE_MINUS_ZERO (r))
12938     return false;
12939
12940   /* Extract exponent.  */
12941   r = real_value_abs (&r);
12942   exponent = REAL_EXP (&r);
12943
12944   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12945      highest (sign) bit, with a fixed binary point at bit point_pos.
12946      m1 holds the low part of the mantissa, m2 the high part.
12947      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12948      bits for the mantissa, this can fail (low bits will be lost).  */
12949   real_ldexp (&m, &r, point_pos - exponent);
12950   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12951
12952   /* If the low part of the mantissa has bits set we cannot represent
12953      the value.  */
12954   if (w.ulow () != 0)
12955     return false;
12956   /* We have rejected the lower HOST_WIDE_INT, so update our
12957      understanding of how many bits lie in the mantissa and
12958      look only at the high HOST_WIDE_INT.  */
12959   mantissa = w.elt (1);
12960   point_pos -= HOST_BITS_PER_WIDE_INT;
12961
12962   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12963   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12964   if ((mantissa & mask) != 0)
12965     return false;
12966
12967   /* Having filtered unrepresentable values, we may now remove all
12968      but the highest 5 bits.  */
12969   mantissa >>= point_pos - 5;
12970
12971   /* We cannot represent the value 0.0, so reject it.  This is handled
12972      elsewhere.  */
12973   if (mantissa == 0)
12974     return false;
12975
12976   /* Then, as bit 4 is always set, we can mask it off, leaving
12977      the mantissa in the range [0, 15].  */
12978   mantissa &= ~(1 << 4);
12979   gcc_assert (mantissa <= 15);
12980
12981   /* GCC internally does not use IEEE754-like encoding (where normalized
12982      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12983      Our mantissa values are shifted 4 places to the left relative to
12984      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12985      by 5 places to correct for GCC's representation.  */
12986   exponent = 5 - exponent;
12987
12988   return (exponent >= 0 && exponent <= 7);
12989 }
12990
12991 char*
12992 aarch64_output_simd_mov_immediate (rtx const_vector,
12993                                    machine_mode mode,
12994                                    unsigned width)
12995 {
12996   bool is_valid;
12997   static char templ[40];
12998   const char *mnemonic;
12999   const char *shift_op;
13000   unsigned int lane_count = 0;
13001   char element_char;
13002
13003   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13004
13005   /* This will return true to show const_vector is legal for use as either
13006      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
13007      also update INFO to show how the immediate should be generated.  */
13008   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13009   gcc_assert (is_valid);
13010
13011   element_char = sizetochar (info.element_width);
13012   lane_count = width / info.element_width;
13013
13014   mode = GET_MODE_INNER (mode);
13015   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13016     {
13017       gcc_assert (info.shift == 0 && ! info.mvn);
13018       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13019          move immediate path.  */
13020       if (aarch64_float_const_zero_rtx_p (info.value))
13021         info.value = GEN_INT (0);
13022       else
13023         {
13024           const unsigned int buf_size = 20;
13025           char float_buf[buf_size] = {'\0'};
13026           real_to_decimal_for_mode (float_buf,
13027                                     CONST_DOUBLE_REAL_VALUE (info.value),
13028                                     buf_size, buf_size, 1, mode);
13029
13030           if (lane_count == 1)
13031             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13032           else
13033             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13034                       lane_count, element_char, float_buf);
13035           return templ;
13036         }
13037     }
13038
13039   mnemonic = info.mvn ? "mvni" : "movi";
13040   shift_op = info.msl ? "msl" : "lsl";
13041
13042   gcc_assert (CONST_INT_P (info.value));
13043   if (lane_count == 1)
13044     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13045               mnemonic, UINTVAL (info.value));
13046   else if (info.shift)
13047     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13048               ", %s %d", mnemonic, lane_count, element_char,
13049               UINTVAL (info.value), shift_op, info.shift);
13050   else
13051     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13052               mnemonic, lane_count, element_char, UINTVAL (info.value));
13053   return templ;
13054 }
13055
13056 char*
13057 aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
13058 {
13059
13060   /* If a floating point number was passed and we desire to use it in an
13061      integer mode do the conversion to integer.  */
13062   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13063     {
13064       unsigned HOST_WIDE_INT ival;
13065       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13066           gcc_unreachable ();
13067       immediate = gen_int_mode (ival, mode);
13068     }
13069
13070   machine_mode vmode;
13071   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13072      a 128 bit vector mode.  */
13073   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13074
13075   gcc_assert (!VECTOR_MODE_P (mode));
13076   vmode = aarch64_simd_container_mode (mode, width);
13077   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13078   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13079 }
13080
13081 /* Split operands into moves from op[1] + op[2] into op[0].  */
13082
13083 void
13084 aarch64_split_combinev16qi (rtx operands[3])
13085 {
13086   unsigned int dest = REGNO (operands[0]);
13087   unsigned int src1 = REGNO (operands[1]);
13088   unsigned int src2 = REGNO (operands[2]);
13089   machine_mode halfmode = GET_MODE (operands[1]);
13090   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13091   rtx destlo, desthi;
13092
13093   gcc_assert (halfmode == V16QImode);
13094
13095   if (src1 == dest && src2 == dest + halfregs)
13096     {
13097       /* No-op move.  Can't split to nothing; emit something.  */
13098       emit_note (NOTE_INSN_DELETED);
13099       return;
13100     }
13101
13102   /* Preserve register attributes for variable tracking.  */
13103   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13104   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13105                                GET_MODE_SIZE (halfmode));
13106
13107   /* Special case of reversed high/low parts.  */
13108   if (reg_overlap_mentioned_p (operands[2], destlo)
13109       && reg_overlap_mentioned_p (operands[1], desthi))
13110     {
13111       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13112       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13113       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13114     }
13115   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13116     {
13117       /* Try to avoid unnecessary moves if part of the result
13118          is in the right place already.  */
13119       if (src1 != dest)
13120         emit_move_insn (destlo, operands[1]);
13121       if (src2 != dest + halfregs)
13122         emit_move_insn (desthi, operands[2]);
13123     }
13124   else
13125     {
13126       if (src2 != dest + halfregs)
13127         emit_move_insn (desthi, operands[2]);
13128       if (src1 != dest)
13129         emit_move_insn (destlo, operands[1]);
13130     }
13131 }
13132
13133 /* vec_perm support.  */
13134
13135 #define MAX_VECT_LEN 16
13136
13137 struct expand_vec_perm_d
13138 {
13139   rtx target, op0, op1;
13140   unsigned char perm[MAX_VECT_LEN];
13141   machine_mode vmode;
13142   unsigned char nelt;
13143   bool one_vector_p;
13144   bool testing_p;
13145 };
13146
13147 /* Generate a variable permutation.  */
13148
13149 static void
13150 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13151 {
13152   machine_mode vmode = GET_MODE (target);
13153   bool one_vector_p = rtx_equal_p (op0, op1);
13154
13155   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13156   gcc_checking_assert (GET_MODE (op0) == vmode);
13157   gcc_checking_assert (GET_MODE (op1) == vmode);
13158   gcc_checking_assert (GET_MODE (sel) == vmode);
13159   gcc_checking_assert (TARGET_SIMD);
13160
13161   if (one_vector_p)
13162     {
13163       if (vmode == V8QImode)
13164         {
13165           /* Expand the argument to a V16QI mode by duplicating it.  */
13166           rtx pair = gen_reg_rtx (V16QImode);
13167           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13168           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13169         }
13170       else
13171         {
13172           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13173         }
13174     }
13175   else
13176     {
13177       rtx pair;
13178
13179       if (vmode == V8QImode)
13180         {
13181           pair = gen_reg_rtx (V16QImode);
13182           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13183           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13184         }
13185       else
13186         {
13187           pair = gen_reg_rtx (OImode);
13188           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13189           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13190         }
13191     }
13192 }
13193
13194 void
13195 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13196 {
13197   machine_mode vmode = GET_MODE (target);
13198   unsigned int nelt = GET_MODE_NUNITS (vmode);
13199   bool one_vector_p = rtx_equal_p (op0, op1);
13200   rtx mask;
13201
13202   /* The TBL instruction does not use a modulo index, so we must take care
13203      of that ourselves.  */
13204   mask = aarch64_simd_gen_const_vector_dup (vmode,
13205       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13206   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13207
13208   /* For big-endian, we also need to reverse the index within the vector
13209      (but not which vector).  */
13210   if (BYTES_BIG_ENDIAN)
13211     {
13212       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13213       if (!one_vector_p)
13214         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13215       sel = expand_simple_binop (vmode, XOR, sel, mask,
13216                                  NULL, 0, OPTAB_LIB_WIDEN);
13217     }
13218   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13219 }
13220
13221 /* Recognize patterns suitable for the TRN instructions.  */
13222 static bool
13223 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13224 {
13225   unsigned int i, odd, mask, nelt = d->nelt;
13226   rtx out, in0, in1, x;
13227   rtx (*gen) (rtx, rtx, rtx);
13228   machine_mode vmode = d->vmode;
13229
13230   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13231     return false;
13232
13233   /* Note that these are little-endian tests.
13234      We correct for big-endian later.  */
13235   if (d->perm[0] == 0)
13236     odd = 0;
13237   else if (d->perm[0] == 1)
13238     odd = 1;
13239   else
13240     return false;
13241   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13242
13243   for (i = 0; i < nelt; i += 2)
13244     {
13245       if (d->perm[i] != i + odd)
13246         return false;
13247       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13248         return false;
13249     }
13250
13251   /* Success!  */
13252   if (d->testing_p)
13253     return true;
13254
13255   in0 = d->op0;
13256   in1 = d->op1;
13257   if (BYTES_BIG_ENDIAN)
13258     {
13259       x = in0, in0 = in1, in1 = x;
13260       odd = !odd;
13261     }
13262   out = d->target;
13263
13264   if (odd)
13265     {
13266       switch (vmode)
13267         {
13268         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13269         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13270         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13271         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13272         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13273         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13274         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13275         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13276         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13277         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13278         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13279         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13280         default:
13281           return false;
13282         }
13283     }
13284   else
13285     {
13286       switch (vmode)
13287         {
13288         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13289         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13290         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13291         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13292         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13293         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13294         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13295         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13296         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13297         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13298         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13299         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13300         default:
13301           return false;
13302         }
13303     }
13304
13305   emit_insn (gen (out, in0, in1));
13306   return true;
13307 }
13308
13309 /* Recognize patterns suitable for the UZP instructions.  */
13310 static bool
13311 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13312 {
13313   unsigned int i, odd, mask, nelt = d->nelt;
13314   rtx out, in0, in1, x;
13315   rtx (*gen) (rtx, rtx, rtx);
13316   machine_mode vmode = d->vmode;
13317
13318   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13319     return false;
13320
13321   /* Note that these are little-endian tests.
13322      We correct for big-endian later.  */
13323   if (d->perm[0] == 0)
13324     odd = 0;
13325   else if (d->perm[0] == 1)
13326     odd = 1;
13327   else
13328     return false;
13329   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13330
13331   for (i = 0; i < nelt; i++)
13332     {
13333       unsigned elt = (i * 2 + odd) & mask;
13334       if (d->perm[i] != elt)
13335         return false;
13336     }
13337
13338   /* Success!  */
13339   if (d->testing_p)
13340     return true;
13341
13342   in0 = d->op0;
13343   in1 = d->op1;
13344   if (BYTES_BIG_ENDIAN)
13345     {
13346       x = in0, in0 = in1, in1 = x;
13347       odd = !odd;
13348     }
13349   out = d->target;
13350
13351   if (odd)
13352     {
13353       switch (vmode)
13354         {
13355         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13356         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13357         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13358         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13359         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13360         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13361         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13362         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13363         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13364         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13365         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13366         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13367         default:
13368           return false;
13369         }
13370     }
13371   else
13372     {
13373       switch (vmode)
13374         {
13375         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13376         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13377         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13378         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13379         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13380         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13381         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13382         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13383         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13384         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13385         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13386         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13387         default:
13388           return false;
13389         }
13390     }
13391
13392   emit_insn (gen (out, in0, in1));
13393   return true;
13394 }
13395
13396 /* Recognize patterns suitable for the ZIP instructions.  */
13397 static bool
13398 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13399 {
13400   unsigned int i, high, mask, nelt = d->nelt;
13401   rtx out, in0, in1, x;
13402   rtx (*gen) (rtx, rtx, rtx);
13403   machine_mode vmode = d->vmode;
13404
13405   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13406     return false;
13407
13408   /* Note that these are little-endian tests.
13409      We correct for big-endian later.  */
13410   high = nelt / 2;
13411   if (d->perm[0] == high)
13412     /* Do Nothing.  */
13413     ;
13414   else if (d->perm[0] == 0)
13415     high = 0;
13416   else
13417     return false;
13418   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13419
13420   for (i = 0; i < nelt / 2; i++)
13421     {
13422       unsigned elt = (i + high) & mask;
13423       if (d->perm[i * 2] != elt)
13424         return false;
13425       elt = (elt + nelt) & mask;
13426       if (d->perm[i * 2 + 1] != elt)
13427         return false;
13428     }
13429
13430   /* Success!  */
13431   if (d->testing_p)
13432     return true;
13433
13434   in0 = d->op0;
13435   in1 = d->op1;
13436   if (BYTES_BIG_ENDIAN)
13437     {
13438       x = in0, in0 = in1, in1 = x;
13439       high = !high;
13440     }
13441   out = d->target;
13442
13443   if (high)
13444     {
13445       switch (vmode)
13446         {
13447         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13448         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13449         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13450         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13451         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13452         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13453         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13454         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13455         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13456         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13457         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13458         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13459         default:
13460           return false;
13461         }
13462     }
13463   else
13464     {
13465       switch (vmode)
13466         {
13467         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13468         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13469         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13470         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13471         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13472         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13473         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13474         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13475         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13476         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13477         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13478         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13479         default:
13480           return false;
13481         }
13482     }
13483
13484   emit_insn (gen (out, in0, in1));
13485   return true;
13486 }
13487
13488 /* Recognize patterns for the EXT insn.  */
13489
13490 static bool
13491 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13492 {
13493   unsigned int i, nelt = d->nelt;
13494   rtx (*gen) (rtx, rtx, rtx, rtx);
13495   rtx offset;
13496
13497   unsigned int location = d->perm[0]; /* Always < nelt.  */
13498
13499   /* Check if the extracted indices are increasing by one.  */
13500   for (i = 1; i < nelt; i++)
13501     {
13502       unsigned int required = location + i;
13503       if (d->one_vector_p)
13504         {
13505           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13506           required &= (nelt - 1);
13507         }
13508       if (d->perm[i] != required)
13509         return false;
13510     }
13511
13512   switch (d->vmode)
13513     {
13514     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13515     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13516     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13517     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13518     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13519     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13520     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13521     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13522     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13523     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13524     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13525     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13526     default:
13527       return false;
13528     }
13529
13530   /* Success! */
13531   if (d->testing_p)
13532     return true;
13533
13534   /* The case where (location == 0) is a no-op for both big- and little-endian,
13535      and is removed by the mid-end at optimization levels -O1 and higher.  */
13536
13537   if (BYTES_BIG_ENDIAN && (location != 0))
13538     {
13539       /* After setup, we want the high elements of the first vector (stored
13540          at the LSB end of the register), and the low elements of the second
13541          vector (stored at the MSB end of the register). So swap.  */
13542       std::swap (d->op0, d->op1);
13543       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13544       location = nelt - location;
13545     }
13546
13547   offset = GEN_INT (location);
13548   emit_insn (gen (d->target, d->op0, d->op1, offset));
13549   return true;
13550 }
13551
13552 /* Recognize patterns for the REV insns.  */
13553
13554 static bool
13555 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13556 {
13557   unsigned int i, j, diff, nelt = d->nelt;
13558   rtx (*gen) (rtx, rtx);
13559
13560   if (!d->one_vector_p)
13561     return false;
13562
13563   diff = d->perm[0];
13564   switch (diff)
13565     {
13566     case 7:
13567       switch (d->vmode)
13568         {
13569         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13570         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13571         default:
13572           return false;
13573         }
13574       break;
13575     case 3:
13576       switch (d->vmode)
13577         {
13578         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13579         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13580         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13581         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13582         default:
13583           return false;
13584         }
13585       break;
13586     case 1:
13587       switch (d->vmode)
13588         {
13589         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13590         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13591         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13592         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13593         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13594         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13595         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13596         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13597         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13598         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13599         default:
13600           return false;
13601         }
13602       break;
13603     default:
13604       return false;
13605     }
13606
13607   for (i = 0; i < nelt ; i += diff + 1)
13608     for (j = 0; j <= diff; j += 1)
13609       {
13610         /* This is guaranteed to be true as the value of diff
13611            is 7, 3, 1 and we should have enough elements in the
13612            queue to generate this.  Getting a vector mask with a
13613            value of diff other than these values implies that
13614            something is wrong by the time we get here.  */
13615         gcc_assert (i + j < nelt);
13616         if (d->perm[i + j] != i + diff - j)
13617           return false;
13618       }
13619
13620   /* Success! */
13621   if (d->testing_p)
13622     return true;
13623
13624   emit_insn (gen (d->target, d->op0));
13625   return true;
13626 }
13627
13628 static bool
13629 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13630 {
13631   rtx (*gen) (rtx, rtx, rtx);
13632   rtx out = d->target;
13633   rtx in0;
13634   machine_mode vmode = d->vmode;
13635   unsigned int i, elt, nelt = d->nelt;
13636   rtx lane;
13637
13638   elt = d->perm[0];
13639   for (i = 1; i < nelt; i++)
13640     {
13641       if (elt != d->perm[i])
13642         return false;
13643     }
13644
13645   /* The generic preparation in aarch64_expand_vec_perm_const_1
13646      swaps the operand order and the permute indices if it finds
13647      d->perm[0] to be in the second operand.  Thus, we can always
13648      use d->op0 and need not do any extra arithmetic to get the
13649      correct lane number.  */
13650   in0 = d->op0;
13651   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13652
13653   switch (vmode)
13654     {
13655     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13656     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13657     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13658     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13659     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13660     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13661     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13662     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13663     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13664     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13665     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13666     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13667     default:
13668       return false;
13669     }
13670
13671   emit_insn (gen (out, in0, lane));
13672   return true;
13673 }
13674
13675 static bool
13676 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13677 {
13678   rtx rperm[MAX_VECT_LEN], sel;
13679   machine_mode vmode = d->vmode;
13680   unsigned int i, nelt = d->nelt;
13681
13682   if (d->testing_p)
13683     return true;
13684
13685   /* Generic code will try constant permutation twice.  Once with the
13686      original mode and again with the elements lowered to QImode.
13687      So wait and don't do the selector expansion ourselves.  */
13688   if (vmode != V8QImode && vmode != V16QImode)
13689     return false;
13690
13691   for (i = 0; i < nelt; ++i)
13692     {
13693       int nunits = GET_MODE_NUNITS (vmode);
13694
13695       /* If big-endian and two vectors we end up with a weird mixed-endian
13696          mode on NEON.  Reverse the index within each word but not the word
13697          itself.  */
13698       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13699                                            : d->perm[i]);
13700     }
13701   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13702   sel = force_reg (vmode, sel);
13703
13704   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13705   return true;
13706 }
13707
13708 static bool
13709 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13710 {
13711   /* The pattern matching functions above are written to look for a small
13712      number to begin the sequence (0, 1, N/2).  If we begin with an index
13713      from the second operand, we can swap the operands.  */
13714   if (d->perm[0] >= d->nelt)
13715     {
13716       unsigned i, nelt = d->nelt;
13717
13718       gcc_assert (nelt == (nelt & -nelt));
13719       for (i = 0; i < nelt; ++i)
13720         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13721
13722       std::swap (d->op0, d->op1);
13723     }
13724
13725   if (TARGET_SIMD)
13726     {
13727       if (aarch64_evpc_rev (d))
13728         return true;
13729       else if (aarch64_evpc_ext (d))
13730         return true;
13731       else if (aarch64_evpc_dup (d))
13732         return true;
13733       else if (aarch64_evpc_zip (d))
13734         return true;
13735       else if (aarch64_evpc_uzp (d))
13736         return true;
13737       else if (aarch64_evpc_trn (d))
13738         return true;
13739       return aarch64_evpc_tbl (d);
13740     }
13741   return false;
13742 }
13743
13744 /* Expand a vec_perm_const pattern.  */
13745
13746 bool
13747 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13748 {
13749   struct expand_vec_perm_d d;
13750   int i, nelt, which;
13751
13752   d.target = target;
13753   d.op0 = op0;
13754   d.op1 = op1;
13755
13756   d.vmode = GET_MODE (target);
13757   gcc_assert (VECTOR_MODE_P (d.vmode));
13758   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13759   d.testing_p = false;
13760
13761   for (i = which = 0; i < nelt; ++i)
13762     {
13763       rtx e = XVECEXP (sel, 0, i);
13764       int ei = INTVAL (e) & (2 * nelt - 1);
13765       which |= (ei < nelt ? 1 : 2);
13766       d.perm[i] = ei;
13767     }
13768
13769   switch (which)
13770     {
13771     default:
13772       gcc_unreachable ();
13773
13774     case 3:
13775       d.one_vector_p = false;
13776       if (!rtx_equal_p (op0, op1))
13777         break;
13778
13779       /* The elements of PERM do not suggest that only the first operand
13780          is used, but both operands are identical.  Allow easier matching
13781          of the permutation by folding the permutation into the single
13782          input vector.  */
13783       /* Fall Through.  */
13784     case 2:
13785       for (i = 0; i < nelt; ++i)
13786         d.perm[i] &= nelt - 1;
13787       d.op0 = op1;
13788       d.one_vector_p = true;
13789       break;
13790
13791     case 1:
13792       d.op1 = op0;
13793       d.one_vector_p = true;
13794       break;
13795     }
13796
13797   return aarch64_expand_vec_perm_const_1 (&d);
13798 }
13799
13800 static bool
13801 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13802                                      const unsigned char *sel)
13803 {
13804   struct expand_vec_perm_d d;
13805   unsigned int i, nelt, which;
13806   bool ret;
13807
13808   d.vmode = vmode;
13809   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13810   d.testing_p = true;
13811   memcpy (d.perm, sel, nelt);
13812
13813   /* Calculate whether all elements are in one vector.  */
13814   for (i = which = 0; i < nelt; ++i)
13815     {
13816       unsigned char e = d.perm[i];
13817       gcc_assert (e < 2 * nelt);
13818       which |= (e < nelt ? 1 : 2);
13819     }
13820
13821   /* If all elements are from the second vector, reindex as if from the
13822      first vector.  */
13823   if (which == 2)
13824     for (i = 0; i < nelt; ++i)
13825       d.perm[i] -= nelt;
13826
13827   /* Check whether the mask can be applied to a single vector.  */
13828   d.one_vector_p = (which != 3);
13829
13830   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13831   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13832   if (!d.one_vector_p)
13833     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13834
13835   start_sequence ();
13836   ret = aarch64_expand_vec_perm_const_1 (&d);
13837   end_sequence ();
13838
13839   return ret;
13840 }
13841
13842 rtx
13843 aarch64_reverse_mask (machine_mode mode)
13844 {
13845   /* We have to reverse each vector because we dont have
13846      a permuted load that can reverse-load according to ABI rules.  */
13847   rtx mask;
13848   rtvec v = rtvec_alloc (16);
13849   int i, j;
13850   int nunits = GET_MODE_NUNITS (mode);
13851   int usize = GET_MODE_UNIT_SIZE (mode);
13852
13853   gcc_assert (BYTES_BIG_ENDIAN);
13854   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13855
13856   for (i = 0; i < nunits; i++)
13857     for (j = 0; j < usize; j++)
13858       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13859   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13860   return force_reg (V16QImode, mask);
13861 }
13862
13863 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
13864    true.  However due to issues with register allocation it is preferable
13865    to avoid tieing integer scalar and FP scalar modes.  Executing integer
13866    operations in general registers is better than treating them as scalar
13867    vector operations.  This reduces latency and avoids redundant int<->FP
13868    moves.  So tie modes if they are either the same class, or vector modes
13869    with other vector modes, vector structs or any scalar mode.  */
13870
13871 static bool
13872 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13873 {
13874   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13875     return true;
13876
13877   /* We specifically want to allow elements of "structure" modes to
13878      be tieable to the structure.  This more general condition allows
13879      other rarer situations too.  */
13880   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13881     return true;
13882
13883   /* Also allow any scalar modes with vectors.  */
13884   if (aarch64_vector_mode_supported_p (mode1)
13885       || aarch64_vector_mode_supported_p (mode2))
13886     return true;
13887
13888   return false;
13889 }
13890
13891 /* Return a new RTX holding the result of moving POINTER forward by
13892    AMOUNT bytes.  */
13893
13894 static rtx
13895 aarch64_move_pointer (rtx pointer, int amount)
13896 {
13897   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13898
13899   return adjust_automodify_address (pointer, GET_MODE (pointer),
13900                                     next, amount);
13901 }
13902
13903 /* Return a new RTX holding the result of moving POINTER forward by the
13904    size of the mode it points to.  */
13905
13906 static rtx
13907 aarch64_progress_pointer (rtx pointer)
13908 {
13909   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13910
13911   return aarch64_move_pointer (pointer, amount);
13912 }
13913
13914 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13915    MODE bytes.  */
13916
13917 static void
13918 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13919                                               machine_mode mode)
13920 {
13921   rtx reg = gen_reg_rtx (mode);
13922
13923   /* "Cast" the pointers to the correct mode.  */
13924   *src = adjust_address (*src, mode, 0);
13925   *dst = adjust_address (*dst, mode, 0);
13926   /* Emit the memcpy.  */
13927   emit_move_insn (reg, *src);
13928   emit_move_insn (*dst, reg);
13929   /* Move the pointers forward.  */
13930   *src = aarch64_progress_pointer (*src);
13931   *dst = aarch64_progress_pointer (*dst);
13932 }
13933
13934 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13935    we succeed, otherwise return false.  */
13936
13937 bool
13938 aarch64_expand_movmem (rtx *operands)
13939 {
13940   unsigned int n;
13941   rtx dst = operands[0];
13942   rtx src = operands[1];
13943   rtx base;
13944   bool speed_p = !optimize_function_for_size_p (cfun);
13945
13946   /* When optimizing for size, give a better estimate of the length of a
13947      memcpy call, but use the default otherwise.  */
13948   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13949
13950   /* We can't do anything smart if the amount to copy is not constant.  */
13951   if (!CONST_INT_P (operands[2]))
13952     return false;
13953
13954   n = UINTVAL (operands[2]);
13955
13956   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13957      need to make at most two moves.  For cases above 16 bytes it will be one
13958      move for each 16 byte chunk, then at most two additional moves.  */
13959   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13960     return false;
13961
13962   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13963   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13964
13965   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13966   src = adjust_automodify_address (src, VOIDmode, base, 0);
13967
13968   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13969      1-byte chunk.  */
13970   if (n < 4)
13971     {
13972       if (n >= 2)
13973         {
13974           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13975           n -= 2;
13976         }
13977
13978       if (n == 1)
13979         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13980
13981       return true;
13982     }
13983
13984   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13985      4-byte chunk, partially overlapping with the previously copied chunk.  */
13986   if (n < 8)
13987     {
13988       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13989       n -= 4;
13990       if (n > 0)
13991         {
13992           int move = n - 4;
13993
13994           src = aarch64_move_pointer (src, move);
13995           dst = aarch64_move_pointer (dst, move);
13996           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13997         }
13998       return true;
13999     }
14000
14001   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
14002      them, then (if applicable) an 8-byte chunk.  */
14003   while (n >= 8)
14004     {
14005       if (n / 16)
14006         {
14007           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14008           n -= 16;
14009         }
14010       else
14011         {
14012           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14013           n -= 8;
14014         }
14015     }
14016
14017   /* Finish the final bytes of the copy.  We can always do this in one
14018      instruction.  We either copy the exact amount we need, or partially
14019      overlap with the previous chunk we copied and copy 8-bytes.  */
14020   if (n == 0)
14021     return true;
14022   else if (n == 1)
14023     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14024   else if (n == 2)
14025     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14026   else if (n == 4)
14027     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14028   else
14029     {
14030       if (n == 3)
14031         {
14032           src = aarch64_move_pointer (src, -1);
14033           dst = aarch64_move_pointer (dst, -1);
14034           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14035         }
14036       else
14037         {
14038           int move = n - 8;
14039
14040           src = aarch64_move_pointer (src, move);
14041           dst = aarch64_move_pointer (dst, move);
14042           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14043         }
14044     }
14045
14046   return true;
14047 }
14048
14049 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14050    SImode stores.  Handle the case when the constant has identical
14051    bottom and top halves.  This is beneficial when the two stores can be
14052    merged into an STP and we avoid synthesising potentially expensive
14053    immediates twice.  Return true if such a split is possible.  */
14054
14055 bool
14056 aarch64_split_dimode_const_store (rtx dst, rtx src)
14057 {
14058   rtx lo = gen_lowpart (SImode, src);
14059   rtx hi = gen_highpart_mode (SImode, DImode, src);
14060
14061   bool size_p = optimize_function_for_size_p (cfun);
14062
14063   if (!rtx_equal_p (lo, hi))
14064     return false;
14065
14066   unsigned int orig_cost
14067     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14068   unsigned int lo_cost
14069     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14070
14071   /* We want to transform:
14072      MOV        x1, 49370
14073      MOVK       x1, 0x140, lsl 16
14074      MOVK       x1, 0xc0da, lsl 32
14075      MOVK       x1, 0x140, lsl 48
14076      STR        x1, [x0]
14077    into:
14078      MOV        w1, 49370
14079      MOVK       w1, 0x140, lsl 16
14080      STP        w1, w1, [x0]
14081    So we want to perform this only when we save two instructions
14082    or more.  When optimizing for size, however, accept any code size
14083    savings we can.  */
14084   if (size_p && orig_cost <= lo_cost)
14085     return false;
14086
14087   if (!size_p
14088       && (orig_cost <= lo_cost + 1))
14089     return false;
14090
14091   rtx mem_lo = adjust_address (dst, SImode, 0);
14092   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14093     return false;
14094
14095   rtx tmp_reg = gen_reg_rtx (SImode);
14096   aarch64_expand_mov_immediate (tmp_reg, lo);
14097   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14098   /* Don't emit an explicit store pair as this may not be always profitable.
14099      Let the sched-fusion logic decide whether to merge them.  */
14100   emit_move_insn (mem_lo, tmp_reg);
14101   emit_move_insn (mem_hi, tmp_reg);
14102
14103   return true;
14104 }
14105
14106 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14107
14108 static unsigned HOST_WIDE_INT
14109 aarch64_asan_shadow_offset (void)
14110 {
14111   return (HOST_WIDE_INT_1 << 36);
14112 }
14113
14114 static bool
14115 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14116                                         unsigned int align,
14117                                         enum by_pieces_operation op,
14118                                         bool speed_p)
14119 {
14120   /* STORE_BY_PIECES can be used when copying a constant string, but
14121      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14122      For now we always fail this and let the move_by_pieces code copy
14123      the string from read-only memory.  */
14124   if (op == STORE_BY_PIECES)
14125     return false;
14126
14127   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14128 }
14129
14130 static rtx
14131 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14132                         int code, tree treeop0, tree treeop1)
14133 {
14134   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14135   rtx op0, op1;
14136   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14137   insn_code icode;
14138   struct expand_operand ops[4];
14139
14140   start_sequence ();
14141   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14142
14143   op_mode = GET_MODE (op0);
14144   if (op_mode == VOIDmode)
14145     op_mode = GET_MODE (op1);
14146
14147   switch (op_mode)
14148     {
14149     case E_QImode:
14150     case E_HImode:
14151     case E_SImode:
14152       cmp_mode = SImode;
14153       icode = CODE_FOR_cmpsi;
14154       break;
14155
14156     case E_DImode:
14157       cmp_mode = DImode;
14158       icode = CODE_FOR_cmpdi;
14159       break;
14160
14161     case E_SFmode:
14162       cmp_mode = SFmode;
14163       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14164       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14165       break;
14166
14167     case E_DFmode:
14168       cmp_mode = DFmode;
14169       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14170       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14171       break;
14172
14173     default:
14174       end_sequence ();
14175       return NULL_RTX;
14176     }
14177
14178   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14179   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14180   if (!op0 || !op1)
14181     {
14182       end_sequence ();
14183       return NULL_RTX;
14184     }
14185   *prep_seq = get_insns ();
14186   end_sequence ();
14187
14188   create_fixed_operand (&ops[0], op0);
14189   create_fixed_operand (&ops[1], op1);
14190
14191   start_sequence ();
14192   if (!maybe_expand_insn (icode, 2, ops))
14193     {
14194       end_sequence ();
14195       return NULL_RTX;
14196     }
14197   *gen_seq = get_insns ();
14198   end_sequence ();
14199
14200   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14201                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14202 }
14203
14204 static rtx
14205 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14206                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14207 {
14208   rtx op0, op1, target;
14209   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14210   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14211   insn_code icode;
14212   struct expand_operand ops[6];
14213   int aarch64_cond;
14214
14215   push_to_sequence (*prep_seq);
14216   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14217
14218   op_mode = GET_MODE (op0);
14219   if (op_mode == VOIDmode)
14220     op_mode = GET_MODE (op1);
14221
14222   switch (op_mode)
14223     {
14224     case E_QImode:
14225     case E_HImode:
14226     case E_SImode:
14227       cmp_mode = SImode;
14228       icode = CODE_FOR_ccmpsi;
14229       break;
14230
14231     case E_DImode:
14232       cmp_mode = DImode;
14233       icode = CODE_FOR_ccmpdi;
14234       break;
14235
14236     case E_SFmode:
14237       cmp_mode = SFmode;
14238       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14239       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14240       break;
14241
14242     case E_DFmode:
14243       cmp_mode = DFmode;
14244       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14245       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14246       break;
14247
14248     default:
14249       end_sequence ();
14250       return NULL_RTX;
14251     }
14252
14253   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14254   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14255   if (!op0 || !op1)
14256     {
14257       end_sequence ();
14258       return NULL_RTX;
14259     }
14260   *prep_seq = get_insns ();
14261   end_sequence ();
14262
14263   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14264   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14265
14266   if (bit_code != AND)
14267     {
14268       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14269                                                 GET_MODE (XEXP (prev, 0))),
14270                              VOIDmode, XEXP (prev, 0), const0_rtx);
14271       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14272     }
14273
14274   create_fixed_operand (&ops[0], XEXP (prev, 0));
14275   create_fixed_operand (&ops[1], target);
14276   create_fixed_operand (&ops[2], op0);
14277   create_fixed_operand (&ops[3], op1);
14278   create_fixed_operand (&ops[4], prev);
14279   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14280
14281   push_to_sequence (*gen_seq);
14282   if (!maybe_expand_insn (icode, 6, ops))
14283     {
14284       end_sequence ();
14285       return NULL_RTX;
14286     }
14287
14288   *gen_seq = get_insns ();
14289   end_sequence ();
14290
14291   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14292 }
14293
14294 #undef TARGET_GEN_CCMP_FIRST
14295 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14296
14297 #undef TARGET_GEN_CCMP_NEXT
14298 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14299
14300 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14301    instruction fusion of some sort.  */
14302
14303 static bool
14304 aarch64_macro_fusion_p (void)
14305 {
14306   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14307 }
14308
14309
14310 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14311    should be kept together during scheduling.  */
14312
14313 static bool
14314 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14315 {
14316   rtx set_dest;
14317   rtx prev_set = single_set (prev);
14318   rtx curr_set = single_set (curr);
14319   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14320   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14321
14322   if (!aarch64_macro_fusion_p ())
14323     return false;
14324
14325   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14326     {
14327       /* We are trying to match:
14328          prev (mov)  == (set (reg r0) (const_int imm16))
14329          curr (movk) == (set (zero_extract (reg r0)
14330                                            (const_int 16)
14331                                            (const_int 16))
14332                              (const_int imm16_1))  */
14333
14334       set_dest = SET_DEST (curr_set);
14335
14336       if (GET_CODE (set_dest) == ZERO_EXTRACT
14337           && CONST_INT_P (SET_SRC (curr_set))
14338           && CONST_INT_P (SET_SRC (prev_set))
14339           && CONST_INT_P (XEXP (set_dest, 2))
14340           && INTVAL (XEXP (set_dest, 2)) == 16
14341           && REG_P (XEXP (set_dest, 0))
14342           && REG_P (SET_DEST (prev_set))
14343           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14344         {
14345           return true;
14346         }
14347     }
14348
14349   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14350     {
14351
14352       /*  We're trying to match:
14353           prev (adrp) == (set (reg r1)
14354                               (high (symbol_ref ("SYM"))))
14355           curr (add) == (set (reg r0)
14356                              (lo_sum (reg r1)
14357                                      (symbol_ref ("SYM"))))
14358           Note that r0 need not necessarily be the same as r1, especially
14359           during pre-regalloc scheduling.  */
14360
14361       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14362           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14363         {
14364           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14365               && REG_P (XEXP (SET_SRC (curr_set), 0))
14366               && REGNO (XEXP (SET_SRC (curr_set), 0))
14367                  == REGNO (SET_DEST (prev_set))
14368               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14369                               XEXP (SET_SRC (curr_set), 1)))
14370             return true;
14371         }
14372     }
14373
14374   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14375     {
14376
14377       /* We're trying to match:
14378          prev (movk) == (set (zero_extract (reg r0)
14379                                            (const_int 16)
14380                                            (const_int 32))
14381                              (const_int imm16_1))
14382          curr (movk) == (set (zero_extract (reg r0)
14383                                            (const_int 16)
14384                                            (const_int 48))
14385                              (const_int imm16_2))  */
14386
14387       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14388           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14389           && REG_P (XEXP (SET_DEST (prev_set), 0))
14390           && REG_P (XEXP (SET_DEST (curr_set), 0))
14391           && REGNO (XEXP (SET_DEST (prev_set), 0))
14392              == REGNO (XEXP (SET_DEST (curr_set), 0))
14393           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14394           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14395           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14396           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14397           && CONST_INT_P (SET_SRC (prev_set))
14398           && CONST_INT_P (SET_SRC (curr_set)))
14399         return true;
14400
14401     }
14402   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14403     {
14404       /* We're trying to match:
14405           prev (adrp) == (set (reg r0)
14406                               (high (symbol_ref ("SYM"))))
14407           curr (ldr) == (set (reg r1)
14408                              (mem (lo_sum (reg r0)
14409                                              (symbol_ref ("SYM")))))
14410                  or
14411           curr (ldr) == (set (reg r1)
14412                              (zero_extend (mem
14413                                            (lo_sum (reg r0)
14414                                                    (symbol_ref ("SYM"))))))  */
14415       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14416           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14417         {
14418           rtx curr_src = SET_SRC (curr_set);
14419
14420           if (GET_CODE (curr_src) == ZERO_EXTEND)
14421             curr_src = XEXP (curr_src, 0);
14422
14423           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14424               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14425               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14426                  == REGNO (SET_DEST (prev_set))
14427               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14428                               XEXP (SET_SRC (prev_set), 0)))
14429               return true;
14430         }
14431     }
14432
14433   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14434        && aarch_crypto_can_dual_issue (prev, curr))
14435     return true;
14436
14437   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14438       && any_condjump_p (curr))
14439     {
14440       enum attr_type prev_type = get_attr_type (prev);
14441
14442       unsigned int condreg1, condreg2;
14443       rtx cc_reg_1;
14444       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14445       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14446
14447       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14448           && prev
14449           && modified_in_p (cc_reg_1, prev))
14450         {
14451           /* FIXME: this misses some which is considered simple arthematic
14452              instructions for ThunderX.  Simple shifts are missed here.  */
14453           if (prev_type == TYPE_ALUS_SREG
14454               || prev_type == TYPE_ALUS_IMM
14455               || prev_type == TYPE_LOGICS_REG
14456               || prev_type == TYPE_LOGICS_IMM)
14457             return true;
14458         }
14459     }
14460
14461   if (prev_set
14462       && curr_set
14463       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14464       && any_condjump_p (curr))
14465     {
14466       /* We're trying to match:
14467           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14468           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14469                                                          (const_int 0))
14470                                                  (label_ref ("SYM"))
14471                                                  (pc))  */
14472       if (SET_DEST (curr_set) == (pc_rtx)
14473           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14474           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14475           && REG_P (SET_DEST (prev_set))
14476           && REGNO (SET_DEST (prev_set))
14477              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14478         {
14479           /* Fuse ALU operations followed by conditional branch instruction.  */
14480           switch (get_attr_type (prev))
14481             {
14482             case TYPE_ALU_IMM:
14483             case TYPE_ALU_SREG:
14484             case TYPE_ADC_REG:
14485             case TYPE_ADC_IMM:
14486             case TYPE_ADCS_REG:
14487             case TYPE_ADCS_IMM:
14488             case TYPE_LOGIC_REG:
14489             case TYPE_LOGIC_IMM:
14490             case TYPE_CSEL:
14491             case TYPE_ADR:
14492             case TYPE_MOV_IMM:
14493             case TYPE_SHIFT_REG:
14494             case TYPE_SHIFT_IMM:
14495             case TYPE_BFM:
14496             case TYPE_RBIT:
14497             case TYPE_REV:
14498             case TYPE_EXTEND:
14499               return true;
14500
14501             default:;
14502             }
14503         }
14504     }
14505
14506   return false;
14507 }
14508
14509 /* Return true iff the instruction fusion described by OP is enabled.  */
14510
14511 bool
14512 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14513 {
14514   return (aarch64_tune_params.fusible_ops & op) != 0;
14515 }
14516
14517 /* If MEM is in the form of [base+offset], extract the two parts
14518    of address and set to BASE and OFFSET, otherwise return false
14519    after clearing BASE and OFFSET.  */
14520
14521 bool
14522 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14523 {
14524   rtx addr;
14525
14526   gcc_assert (MEM_P (mem));
14527
14528   addr = XEXP (mem, 0);
14529
14530   if (REG_P (addr))
14531     {
14532       *base = addr;
14533       *offset = const0_rtx;
14534       return true;
14535     }
14536
14537   if (GET_CODE (addr) == PLUS
14538       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14539     {
14540       *base = XEXP (addr, 0);
14541       *offset = XEXP (addr, 1);
14542       return true;
14543     }
14544
14545   *base = NULL_RTX;
14546   *offset = NULL_RTX;
14547
14548   return false;
14549 }
14550
14551 /* Types for scheduling fusion.  */
14552 enum sched_fusion_type
14553 {
14554   SCHED_FUSION_NONE = 0,
14555   SCHED_FUSION_LD_SIGN_EXTEND,
14556   SCHED_FUSION_LD_ZERO_EXTEND,
14557   SCHED_FUSION_LD,
14558   SCHED_FUSION_ST,
14559   SCHED_FUSION_NUM
14560 };
14561
14562 /* If INSN is a load or store of address in the form of [base+offset],
14563    extract the two parts and set to BASE and OFFSET.  Return scheduling
14564    fusion type this INSN is.  */
14565
14566 static enum sched_fusion_type
14567 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14568 {
14569   rtx x, dest, src;
14570   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14571
14572   gcc_assert (INSN_P (insn));
14573   x = PATTERN (insn);
14574   if (GET_CODE (x) != SET)
14575     return SCHED_FUSION_NONE;
14576
14577   src = SET_SRC (x);
14578   dest = SET_DEST (x);
14579
14580   machine_mode dest_mode = GET_MODE (dest);
14581
14582   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14583     return SCHED_FUSION_NONE;
14584
14585   if (GET_CODE (src) == SIGN_EXTEND)
14586     {
14587       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14588       src = XEXP (src, 0);
14589       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14590         return SCHED_FUSION_NONE;
14591     }
14592   else if (GET_CODE (src) == ZERO_EXTEND)
14593     {
14594       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14595       src = XEXP (src, 0);
14596       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14597         return SCHED_FUSION_NONE;
14598     }
14599
14600   if (GET_CODE (src) == MEM && REG_P (dest))
14601     extract_base_offset_in_addr (src, base, offset);
14602   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14603     {
14604       fusion = SCHED_FUSION_ST;
14605       extract_base_offset_in_addr (dest, base, offset);
14606     }
14607   else
14608     return SCHED_FUSION_NONE;
14609
14610   if (*base == NULL_RTX || *offset == NULL_RTX)
14611     fusion = SCHED_FUSION_NONE;
14612
14613   return fusion;
14614 }
14615
14616 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14617
14618    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14619    and PRI are only calculated for these instructions.  For other instruction,
14620    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14621    type instruction fusion can be added by returning different priorities.
14622
14623    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14624
14625 static void
14626 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14627                                int *fusion_pri, int *pri)
14628 {
14629   int tmp, off_val;
14630   rtx base, offset;
14631   enum sched_fusion_type fusion;
14632
14633   gcc_assert (INSN_P (insn));
14634
14635   tmp = max_pri - 1;
14636   fusion = fusion_load_store (insn, &base, &offset);
14637   if (fusion == SCHED_FUSION_NONE)
14638     {
14639       *pri = tmp;
14640       *fusion_pri = tmp;
14641       return;
14642     }
14643
14644   /* Set FUSION_PRI according to fusion type and base register.  */
14645   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14646
14647   /* Calculate PRI.  */
14648   tmp /= 2;
14649
14650   /* INSN with smaller offset goes first.  */
14651   off_val = (int)(INTVAL (offset));
14652   if (off_val >= 0)
14653     tmp -= (off_val & 0xfffff);
14654   else
14655     tmp += ((- off_val) & 0xfffff);
14656
14657   *pri = tmp;
14658   return;
14659 }
14660
14661 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14662    Adjust priority of sha1h instructions so they are scheduled before
14663    other SHA1 instructions.  */
14664
14665 static int
14666 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14667 {
14668   rtx x = PATTERN (insn);
14669
14670   if (GET_CODE (x) == SET)
14671     {
14672       x = SET_SRC (x);
14673
14674       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14675         return priority + 10;
14676     }
14677
14678   return priority;
14679 }
14680
14681 /* Given OPERANDS of consecutive load/store, check if we can merge
14682    them into ldp/stp.  LOAD is true if they are load instructions.
14683    MODE is the mode of memory operands.  */
14684
14685 bool
14686 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14687                                 machine_mode mode)
14688 {
14689   HOST_WIDE_INT offval_1, offval_2, msize;
14690   enum reg_class rclass_1, rclass_2;
14691   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14692
14693   if (load)
14694     {
14695       mem_1 = operands[1];
14696       mem_2 = operands[3];
14697       reg_1 = operands[0];
14698       reg_2 = operands[2];
14699       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14700       if (REGNO (reg_1) == REGNO (reg_2))
14701         return false;
14702     }
14703   else
14704     {
14705       mem_1 = operands[0];
14706       mem_2 = operands[2];
14707       reg_1 = operands[1];
14708       reg_2 = operands[3];
14709     }
14710
14711   /* The mems cannot be volatile.  */
14712   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14713     return false;
14714
14715   /* If we have SImode and slow unaligned ldp,
14716      check the alignment to be at least 8 byte. */
14717   if (mode == SImode
14718       && (aarch64_tune_params.extra_tuning_flags
14719           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14720       && !optimize_size
14721       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14722     return false;
14723
14724   /* Check if the addresses are in the form of [base+offset].  */
14725   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14726   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14727     return false;
14728   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14729   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14730     return false;
14731
14732   /* Check if the bases are same.  */
14733   if (!rtx_equal_p (base_1, base_2))
14734     return false;
14735
14736   offval_1 = INTVAL (offset_1);
14737   offval_2 = INTVAL (offset_2);
14738   msize = GET_MODE_SIZE (mode);
14739   /* Check if the offsets are consecutive.  */
14740   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14741     return false;
14742
14743   /* Check if the addresses are clobbered by load.  */
14744   if (load)
14745     {
14746       if (reg_mentioned_p (reg_1, mem_1))
14747         return false;
14748
14749       /* In increasing order, the last load can clobber the address.  */
14750       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14751       return false;
14752     }
14753
14754   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14755     rclass_1 = FP_REGS;
14756   else
14757     rclass_1 = GENERAL_REGS;
14758
14759   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14760     rclass_2 = FP_REGS;
14761   else
14762     rclass_2 = GENERAL_REGS;
14763
14764   /* Check if the registers are of same class.  */
14765   if (rclass_1 != rclass_2)
14766     return false;
14767
14768   return true;
14769 }
14770
14771 /* Given OPERANDS of consecutive load/store, check if we can merge
14772    them into ldp/stp by adjusting the offset.  LOAD is true if they
14773    are load instructions.  MODE is the mode of memory operands.
14774
14775    Given below consecutive stores:
14776
14777      str  w1, [xb, 0x100]
14778      str  w1, [xb, 0x104]
14779      str  w1, [xb, 0x108]
14780      str  w1, [xb, 0x10c]
14781
14782    Though the offsets are out of the range supported by stp, we can
14783    still pair them after adjusting the offset, like:
14784
14785      add  scratch, xb, 0x100
14786      stp  w1, w1, [scratch]
14787      stp  w1, w1, [scratch, 0x8]
14788
14789    The peephole patterns detecting this opportunity should guarantee
14790    the scratch register is avaliable.  */
14791
14792 bool
14793 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14794                                        machine_mode mode)
14795 {
14796   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14797   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14798   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14799   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14800
14801   if (load)
14802     {
14803       reg_1 = operands[0];
14804       mem_1 = operands[1];
14805       reg_2 = operands[2];
14806       mem_2 = operands[3];
14807       reg_3 = operands[4];
14808       mem_3 = operands[5];
14809       reg_4 = operands[6];
14810       mem_4 = operands[7];
14811       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14812                   && REG_P (reg_3) && REG_P (reg_4));
14813       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14814         return false;
14815     }
14816   else
14817     {
14818       mem_1 = operands[0];
14819       reg_1 = operands[1];
14820       mem_2 = operands[2];
14821       reg_2 = operands[3];
14822       mem_3 = operands[4];
14823       reg_3 = operands[5];
14824       mem_4 = operands[6];
14825       reg_4 = operands[7];
14826     }
14827   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14828   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14829     return false;
14830
14831   /* The mems cannot be volatile.  */
14832   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14833       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14834     return false;
14835
14836   /* Check if the addresses are in the form of [base+offset].  */
14837   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14838   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14839     return false;
14840   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14841   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14842     return false;
14843   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14844   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14845     return false;
14846   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14847   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14848     return false;
14849
14850   /* Check if the bases are same.  */
14851   if (!rtx_equal_p (base_1, base_2)
14852       || !rtx_equal_p (base_2, base_3)
14853       || !rtx_equal_p (base_3, base_4))
14854     return false;
14855
14856   offval_1 = INTVAL (offset_1);
14857   offval_2 = INTVAL (offset_2);
14858   offval_3 = INTVAL (offset_3);
14859   offval_4 = INTVAL (offset_4);
14860   msize = GET_MODE_SIZE (mode);
14861   /* Check if the offsets are consecutive.  */
14862   if ((offval_1 != (offval_2 + msize)
14863        || offval_1 != (offval_3 + msize * 2)
14864        || offval_1 != (offval_4 + msize * 3))
14865       && (offval_4 != (offval_3 + msize)
14866           || offval_4 != (offval_2 + msize * 2)
14867           || offval_4 != (offval_1 + msize * 3)))
14868     return false;
14869
14870   /* Check if the addresses are clobbered by load.  */
14871   if (load)
14872     {
14873       if (reg_mentioned_p (reg_1, mem_1)
14874           || reg_mentioned_p (reg_2, mem_2)
14875           || reg_mentioned_p (reg_3, mem_3))
14876         return false;
14877
14878       /* In increasing order, the last load can clobber the address.  */
14879       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14880         return false;
14881     }
14882
14883   /* If we have SImode and slow unaligned ldp,
14884      check the alignment to be at least 8 byte. */
14885   if (mode == SImode
14886       && (aarch64_tune_params.extra_tuning_flags
14887           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14888       && !optimize_size
14889       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14890     return false;
14891
14892   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14893     rclass_1 = FP_REGS;
14894   else
14895     rclass_1 = GENERAL_REGS;
14896
14897   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14898     rclass_2 = FP_REGS;
14899   else
14900     rclass_2 = GENERAL_REGS;
14901
14902   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14903     rclass_3 = FP_REGS;
14904   else
14905     rclass_3 = GENERAL_REGS;
14906
14907   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14908     rclass_4 = FP_REGS;
14909   else
14910     rclass_4 = GENERAL_REGS;
14911
14912   /* Check if the registers are of same class.  */
14913   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14914     return false;
14915
14916   return true;
14917 }
14918
14919 /* Given OPERANDS of consecutive load/store, this function pairs them
14920    into ldp/stp after adjusting the offset.  It depends on the fact
14921    that addresses of load/store instructions are in increasing order.
14922    MODE is the mode of memory operands.  CODE is the rtl operator
14923    which should be applied to all memory operands, it's SIGN_EXTEND,
14924    ZERO_EXTEND or UNKNOWN.  */
14925
14926 bool
14927 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14928                              machine_mode mode, RTX_CODE code)
14929 {
14930   rtx base, offset, t1, t2;
14931   rtx mem_1, mem_2, mem_3, mem_4;
14932   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14933
14934   if (load)
14935     {
14936       mem_1 = operands[1];
14937       mem_2 = operands[3];
14938       mem_3 = operands[5];
14939       mem_4 = operands[7];
14940     }
14941   else
14942     {
14943       mem_1 = operands[0];
14944       mem_2 = operands[2];
14945       mem_3 = operands[4];
14946       mem_4 = operands[6];
14947       gcc_assert (code == UNKNOWN);
14948     }
14949
14950   extract_base_offset_in_addr (mem_1, &base, &offset);
14951   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14952
14953   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14954   msize = GET_MODE_SIZE (mode);
14955   stp_off_limit = msize * 0x40;
14956   off_val = INTVAL (offset);
14957   abs_off = (off_val < 0) ? -off_val : off_val;
14958   new_off = abs_off % stp_off_limit;
14959   adj_off = abs_off - new_off;
14960
14961   /* Further adjust to make sure all offsets are OK.  */
14962   if ((new_off + msize * 2) >= stp_off_limit)
14963     {
14964       adj_off += stp_off_limit;
14965       new_off -= stp_off_limit;
14966     }
14967
14968   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14969   if (adj_off >= 0x1000)
14970     return false;
14971
14972   if (off_val < 0)
14973     {
14974       adj_off = -adj_off;
14975       new_off = -new_off;
14976     }
14977
14978   /* Create new memory references.  */
14979   mem_1 = change_address (mem_1, VOIDmode,
14980                           plus_constant (DImode, operands[8], new_off));
14981
14982   /* Check if the adjusted address is OK for ldp/stp.  */
14983   if (!aarch64_mem_pair_operand (mem_1, mode))
14984     return false;
14985
14986   msize = GET_MODE_SIZE (mode);
14987   mem_2 = change_address (mem_2, VOIDmode,
14988                           plus_constant (DImode,
14989                                          operands[8],
14990                                          new_off + msize));
14991   mem_3 = change_address (mem_3, VOIDmode,
14992                           plus_constant (DImode,
14993                                          operands[8],
14994                                          new_off + msize * 2));
14995   mem_4 = change_address (mem_4, VOIDmode,
14996                           plus_constant (DImode,
14997                                          operands[8],
14998                                          new_off + msize * 3));
14999
15000   if (code == ZERO_EXTEND)
15001     {
15002       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15003       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15004       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15005       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15006     }
15007   else if (code == SIGN_EXTEND)
15008     {
15009       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15010       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15011       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15012       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15013     }
15014
15015   if (load)
15016     {
15017       operands[1] = mem_1;
15018       operands[3] = mem_2;
15019       operands[5] = mem_3;
15020       operands[7] = mem_4;
15021     }
15022   else
15023     {
15024       operands[0] = mem_1;
15025       operands[2] = mem_2;
15026       operands[4] = mem_3;
15027       operands[6] = mem_4;
15028     }
15029
15030   /* Emit adjusting instruction.  */
15031   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15032   /* Emit ldp/stp instructions.  */
15033   t1 = gen_rtx_SET (operands[0], operands[1]);
15034   t2 = gen_rtx_SET (operands[2], operands[3]);
15035   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15036   t1 = gen_rtx_SET (operands[4], operands[5]);
15037   t2 = gen_rtx_SET (operands[6], operands[7]);
15038   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15039   return true;
15040 }
15041
15042 /* Return 1 if pseudo register should be created and used to hold
15043    GOT address for PIC code.  */
15044
15045 bool
15046 aarch64_use_pseudo_pic_reg (void)
15047 {
15048   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15049 }
15050
15051 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15052
15053 static int
15054 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15055 {
15056   switch (XINT (x, 1))
15057     {
15058     case UNSPEC_GOTSMALLPIC:
15059     case UNSPEC_GOTSMALLPIC28K:
15060     case UNSPEC_GOTTINYPIC:
15061       return 0;
15062     default:
15063       break;
15064     }
15065
15066   return default_unspec_may_trap_p (x, flags);
15067 }
15068
15069
15070 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15071    return the log2 of that value.  Otherwise return -1.  */
15072
15073 int
15074 aarch64_fpconst_pow_of_2 (rtx x)
15075 {
15076   const REAL_VALUE_TYPE *r;
15077
15078   if (!CONST_DOUBLE_P (x))
15079     return -1;
15080
15081   r = CONST_DOUBLE_REAL_VALUE (x);
15082
15083   if (REAL_VALUE_NEGATIVE (*r)
15084       || REAL_VALUE_ISNAN (*r)
15085       || REAL_VALUE_ISINF (*r)
15086       || !real_isinteger (r, DFmode))
15087     return -1;
15088
15089   return exact_log2 (real_to_integer (r));
15090 }
15091
15092 /* If X is a vector of equal CONST_DOUBLE values and that value is
15093    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15094
15095 int
15096 aarch64_vec_fpconst_pow_of_2 (rtx x)
15097 {
15098   if (GET_CODE (x) != CONST_VECTOR)
15099     return -1;
15100
15101   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15102     return -1;
15103
15104   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15105   if (firstval <= 0)
15106     return -1;
15107
15108   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15109     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15110       return -1;
15111
15112   return firstval;
15113 }
15114
15115 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15116    to float.
15117
15118    __fp16 always promotes through this hook.
15119    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15120    through the generic excess precision logic rather than here.  */
15121
15122 static tree
15123 aarch64_promoted_type (const_tree t)
15124 {
15125   if (SCALAR_FLOAT_TYPE_P (t)
15126       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15127     return float_type_node;
15128
15129   return NULL_TREE;
15130 }
15131
15132 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15133
15134 static bool
15135 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15136                            optimization_type opt_type)
15137 {
15138   switch (op)
15139     {
15140     case rsqrt_optab:
15141       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15142
15143     default:
15144       return true;
15145     }
15146 }
15147
15148 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15149    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15150
15151 static bool
15152 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15153 {
15154   return (mode == HFmode
15155           ? true
15156           : default_libgcc_floating_mode_supported_p (mode));
15157 }
15158
15159 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15160    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15161
15162 static bool
15163 aarch64_scalar_mode_supported_p (scalar_mode mode)
15164 {
15165   return (mode == HFmode
15166           ? true
15167           : default_scalar_mode_supported_p (mode));
15168 }
15169
15170 /* Set the value of FLT_EVAL_METHOD.
15171    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15172
15173     0: evaluate all operations and constants, whose semantic type has at
15174        most the range and precision of type float, to the range and
15175        precision of float; evaluate all other operations and constants to
15176        the range and precision of the semantic type;
15177
15178     N, where _FloatN is a supported interchange floating type
15179        evaluate all operations and constants, whose semantic type has at
15180        most the range and precision of _FloatN type, to the range and
15181        precision of the _FloatN type; evaluate all other operations and
15182        constants to the range and precision of the semantic type;
15183
15184    If we have the ARMv8.2-A extensions then we support _Float16 in native
15185    precision, so we should set this to 16.  Otherwise, we support the type,
15186    but want to evaluate expressions in float precision, so set this to
15187    0.  */
15188
15189 static enum flt_eval_method
15190 aarch64_excess_precision (enum excess_precision_type type)
15191 {
15192   switch (type)
15193     {
15194       case EXCESS_PRECISION_TYPE_FAST:
15195       case EXCESS_PRECISION_TYPE_STANDARD:
15196         /* We can calculate either in 16-bit range and precision or
15197            32-bit range and precision.  Make that decision based on whether
15198            we have native support for the ARMv8.2-A 16-bit floating-point
15199            instructions or not.  */
15200         return (TARGET_FP_F16INST
15201                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15202                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15203       case EXCESS_PRECISION_TYPE_IMPLICIT:
15204         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15205       default:
15206         gcc_unreachable ();
15207     }
15208   return FLT_EVAL_METHOD_UNPREDICTABLE;
15209 }
15210
15211 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15212    scheduled for speculative execution.  Reject the long-running division
15213    and square-root instructions.  */
15214
15215 static bool
15216 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15217 {
15218   switch (get_attr_type (insn))
15219     {
15220       case TYPE_SDIV:
15221       case TYPE_UDIV:
15222       case TYPE_FDIVS:
15223       case TYPE_FDIVD:
15224       case TYPE_FSQRTS:
15225       case TYPE_FSQRTD:
15226       case TYPE_NEON_FP_SQRT_S:
15227       case TYPE_NEON_FP_SQRT_D:
15228       case TYPE_NEON_FP_SQRT_S_Q:
15229       case TYPE_NEON_FP_SQRT_D_Q:
15230       case TYPE_NEON_FP_DIV_S:
15231       case TYPE_NEON_FP_DIV_D:
15232       case TYPE_NEON_FP_DIV_S_Q:
15233       case TYPE_NEON_FP_DIV_D_Q:
15234         return false;
15235       default:
15236         return true;
15237     }
15238 }
15239
15240 /* Target-specific selftests.  */
15241
15242 #if CHECKING_P
15243
15244 namespace selftest {
15245
15246 /* Selftest for the RTL loader.
15247    Verify that the RTL loader copes with a dump from
15248    print_rtx_function.  This is essentially just a test that class
15249    function_reader can handle a real dump, but it also verifies
15250    that lookup_reg_by_dump_name correctly handles hard regs.
15251    The presence of hard reg names in the dump means that the test is
15252    target-specific, hence it is in this file.  */
15253
15254 static void
15255 aarch64_test_loading_full_dump ()
15256 {
15257   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15258
15259   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15260
15261   rtx_insn *insn_1 = get_insn_by_uid (1);
15262   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15263
15264   rtx_insn *insn_15 = get_insn_by_uid (15);
15265   ASSERT_EQ (INSN, GET_CODE (insn_15));
15266   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15267
15268   /* Verify crtl->return_rtx.  */
15269   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15270   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15271   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15272 }
15273
15274 /* Run all target-specific selftests.  */
15275
15276 static void
15277 aarch64_run_selftests (void)
15278 {
15279   aarch64_test_loading_full_dump ();
15280 }
15281
15282 } // namespace selftest
15283
15284 #endif /* #if CHECKING_P */
15285
15286 #undef TARGET_ADDRESS_COST
15287 #define TARGET_ADDRESS_COST aarch64_address_cost
15288
15289 /* This hook will determines whether unnamed bitfields affect the alignment
15290    of the containing structure.  The hook returns true if the structure
15291    should inherit the alignment requirements of an unnamed bitfield's
15292    type.  */
15293 #undef TARGET_ALIGN_ANON_BITFIELD
15294 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15295
15296 #undef TARGET_ASM_ALIGNED_DI_OP
15297 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15298
15299 #undef TARGET_ASM_ALIGNED_HI_OP
15300 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15301
15302 #undef TARGET_ASM_ALIGNED_SI_OP
15303 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15304
15305 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15306 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15307   hook_bool_const_tree_hwi_hwi_const_tree_true
15308
15309 #undef TARGET_ASM_FILE_START
15310 #define TARGET_ASM_FILE_START aarch64_start_file
15311
15312 #undef TARGET_ASM_OUTPUT_MI_THUNK
15313 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15314
15315 #undef TARGET_ASM_SELECT_RTX_SECTION
15316 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15317
15318 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15319 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15320
15321 #undef TARGET_BUILD_BUILTIN_VA_LIST
15322 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15323
15324 #undef TARGET_CALLEE_COPIES
15325 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15326
15327 #undef TARGET_CAN_ELIMINATE
15328 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15329
15330 #undef TARGET_CAN_INLINE_P
15331 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15332
15333 #undef TARGET_CANNOT_FORCE_CONST_MEM
15334 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15335
15336 #undef TARGET_CASE_VALUES_THRESHOLD
15337 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15338
15339 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15340 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15341
15342 /* Only the least significant bit is used for initialization guard
15343    variables.  */
15344 #undef TARGET_CXX_GUARD_MASK_BIT
15345 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15346
15347 #undef TARGET_C_MODE_FOR_SUFFIX
15348 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15349
15350 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15351 #undef  TARGET_DEFAULT_TARGET_FLAGS
15352 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15353 #endif
15354
15355 #undef TARGET_CLASS_MAX_NREGS
15356 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15357
15358 #undef TARGET_BUILTIN_DECL
15359 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15360
15361 #undef TARGET_BUILTIN_RECIPROCAL
15362 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15363
15364 #undef TARGET_C_EXCESS_PRECISION
15365 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15366
15367 #undef  TARGET_EXPAND_BUILTIN
15368 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15369
15370 #undef TARGET_EXPAND_BUILTIN_VA_START
15371 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15372
15373 #undef TARGET_FOLD_BUILTIN
15374 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15375
15376 #undef TARGET_FUNCTION_ARG
15377 #define TARGET_FUNCTION_ARG aarch64_function_arg
15378
15379 #undef TARGET_FUNCTION_ARG_ADVANCE
15380 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15381
15382 #undef TARGET_FUNCTION_ARG_BOUNDARY
15383 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15384
15385 #undef TARGET_FUNCTION_ARG_PADDING
15386 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15387
15388 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15389 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15390
15391 #undef TARGET_FUNCTION_VALUE
15392 #define TARGET_FUNCTION_VALUE aarch64_function_value
15393
15394 #undef TARGET_FUNCTION_VALUE_REGNO_P
15395 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15396
15397 #undef TARGET_FRAME_POINTER_REQUIRED
15398 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15399
15400 #undef TARGET_GIMPLE_FOLD_BUILTIN
15401 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15402
15403 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15404 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15405
15406 #undef  TARGET_INIT_BUILTINS
15407 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15408
15409 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15410 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15411   aarch64_ira_change_pseudo_allocno_class
15412
15413 #undef TARGET_LEGITIMATE_ADDRESS_P
15414 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15415
15416 #undef TARGET_LEGITIMATE_CONSTANT_P
15417 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15418
15419 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15420 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15421   aarch64_legitimize_address_displacement
15422
15423 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15424 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15425
15426 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15427 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15428 aarch64_libgcc_floating_mode_supported_p
15429
15430 #undef TARGET_MANGLE_TYPE
15431 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15432
15433 #undef TARGET_MEMORY_MOVE_COST
15434 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15435
15436 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15437 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15438
15439 #undef TARGET_MUST_PASS_IN_STACK
15440 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15441
15442 /* This target hook should return true if accesses to volatile bitfields
15443    should use the narrowest mode possible.  It should return false if these
15444    accesses should use the bitfield container type.  */
15445 #undef TARGET_NARROW_VOLATILE_BITFIELD
15446 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15447
15448 #undef  TARGET_OPTION_OVERRIDE
15449 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15450
15451 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15452 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15453   aarch64_override_options_after_change
15454
15455 #undef TARGET_OPTION_SAVE
15456 #define TARGET_OPTION_SAVE aarch64_option_save
15457
15458 #undef TARGET_OPTION_RESTORE
15459 #define TARGET_OPTION_RESTORE aarch64_option_restore
15460
15461 #undef TARGET_OPTION_PRINT
15462 #define TARGET_OPTION_PRINT aarch64_option_print
15463
15464 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15465 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15466
15467 #undef TARGET_SET_CURRENT_FUNCTION
15468 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15469
15470 #undef TARGET_PASS_BY_REFERENCE
15471 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15472
15473 #undef TARGET_PREFERRED_RELOAD_CLASS
15474 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15475
15476 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15477 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15478
15479 #undef TARGET_PROMOTED_TYPE
15480 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15481
15482 #undef TARGET_SECONDARY_RELOAD
15483 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15484
15485 #undef TARGET_SHIFT_TRUNCATION_MASK
15486 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15487
15488 #undef TARGET_SETUP_INCOMING_VARARGS
15489 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15490
15491 #undef TARGET_STRUCT_VALUE_RTX
15492 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15493
15494 #undef TARGET_REGISTER_MOVE_COST
15495 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15496
15497 #undef TARGET_RETURN_IN_MEMORY
15498 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15499
15500 #undef TARGET_RETURN_IN_MSB
15501 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15502
15503 #undef TARGET_RTX_COSTS
15504 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15505
15506 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15507 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15508
15509 #undef TARGET_SCHED_ISSUE_RATE
15510 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15511
15512 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15513 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15514   aarch64_sched_first_cycle_multipass_dfa_lookahead
15515
15516 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15517 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15518   aarch64_first_cycle_multipass_dfa_lookahead_guard
15519
15520 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15521 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15522   aarch64_get_separate_components
15523
15524 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15525 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15526   aarch64_components_for_bb
15527
15528 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15529 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15530   aarch64_disqualify_components
15531
15532 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15533 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15534   aarch64_emit_prologue_components
15535
15536 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15537 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15538   aarch64_emit_epilogue_components
15539
15540 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15541 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15542   aarch64_set_handled_components
15543
15544 #undef TARGET_TRAMPOLINE_INIT
15545 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15546
15547 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15548 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15549
15550 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15551 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15552
15553 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15554 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15555   aarch64_builtin_support_vector_misalignment
15556
15557 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15558 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15559
15560 #undef TARGET_VECTORIZE_ADD_STMT_COST
15561 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15562
15563 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15564 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15565   aarch64_builtin_vectorization_cost
15566
15567 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15568 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15569
15570 #undef TARGET_VECTORIZE_BUILTINS
15571 #define TARGET_VECTORIZE_BUILTINS
15572
15573 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15574 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15575   aarch64_builtin_vectorized_function
15576
15577 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15578 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15579   aarch64_autovectorize_vector_sizes
15580
15581 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15582 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15583   aarch64_atomic_assign_expand_fenv
15584
15585 /* Section anchor support.  */
15586
15587 #undef TARGET_MIN_ANCHOR_OFFSET
15588 #define TARGET_MIN_ANCHOR_OFFSET -256
15589
15590 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15591    byte offset; we can do much more for larger data types, but have no way
15592    to determine the size of the access.  We assume accesses are aligned.  */
15593 #undef TARGET_MAX_ANCHOR_OFFSET
15594 #define TARGET_MAX_ANCHOR_OFFSET 4095
15595
15596 #undef TARGET_VECTOR_ALIGNMENT
15597 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15598
15599 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15600 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15601   aarch64_simd_vector_alignment_reachable
15602
15603 /* vec_perm support.  */
15604
15605 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15606 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15607   aarch64_vectorize_vec_perm_const_ok
15608
15609 #undef TARGET_INIT_LIBFUNCS
15610 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15611
15612 #undef TARGET_FIXED_CONDITION_CODE_REGS
15613 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15614
15615 #undef TARGET_FLAGS_REGNUM
15616 #define TARGET_FLAGS_REGNUM CC_REGNUM
15617
15618 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15619 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15620
15621 #undef TARGET_ASAN_SHADOW_OFFSET
15622 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15623
15624 #undef TARGET_LEGITIMIZE_ADDRESS
15625 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15626
15627 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15628 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15629   aarch64_use_by_pieces_infrastructure_p
15630
15631 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15632 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15633
15634 #undef TARGET_CAN_USE_DOLOOP_P
15635 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15636
15637 #undef TARGET_SCHED_ADJUST_PRIORITY
15638 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15639
15640 #undef TARGET_SCHED_MACRO_FUSION_P
15641 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15642
15643 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15644 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15645
15646 #undef TARGET_SCHED_FUSION_PRIORITY
15647 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15648
15649 #undef TARGET_UNSPEC_MAY_TRAP_P
15650 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15651
15652 #undef TARGET_USE_PSEUDO_PIC_REG
15653 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15654
15655 #undef TARGET_PRINT_OPERAND
15656 #define TARGET_PRINT_OPERAND aarch64_print_operand
15657
15658 #undef TARGET_PRINT_OPERAND_ADDRESS
15659 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15660
15661 #undef TARGET_OPTAB_SUPPORTED_P
15662 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15663
15664 #undef TARGET_OMIT_STRUCT_RETURN_REG
15665 #define TARGET_OMIT_STRUCT_RETURN_REG true
15666
15667 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15668 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15669 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15670
15671 #undef TARGET_HARD_REGNO_MODE_OK
15672 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15673
15674 #undef TARGET_MODES_TIEABLE_P
15675 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15676
15677 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15678 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15679   aarch64_hard_regno_call_part_clobbered
15680
15681 #if CHECKING_P
15682 #undef TARGET_RUN_TARGET_SELFTESTS
15683 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15684 #endif /* #if CHECKING_P */
15685
15686 struct gcc_target targetm = TARGET_INITIALIZER;
15687
15688 #include "gt-aarch64.h"