gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 145                                                  const unsigned char *sel);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement HARD_REGNO_NREGS.  */
1071
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1087
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return true;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return
1110           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111       else
1112         return true;
1113     }
1114
1115   return false;
1116 }
1117
1118 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1119    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1120    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1121
1122 static bool
1123 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1124 {
1125   return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1126 }
1127
1128 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1129 machine_mode
1130 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1131                                      machine_mode mode)
1132 {
1133   /* Handle modes that fit within single registers.  */
1134   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1135     {
1136       if (GET_MODE_SIZE (mode) >= 4)
1137         return mode;
1138       else
1139         return SImode;
1140     }
1141   /* Fall back to generic for multi-reg and very large modes.  */
1142   else
1143     return choose_hard_reg_mode (regno, nregs, false);
1144 }
1145
1146 /* Return true if calls to DECL should be treated as
1147    long-calls (ie called via a register).  */
1148 static bool
1149 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1150 {
1151   return false;
1152 }
1153
1154 /* Return true if calls to symbol-ref SYM should be treated as
1155    long-calls (ie called via a register).  */
1156 bool
1157 aarch64_is_long_call_p (rtx sym)
1158 {
1159   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1160 }
1161
1162 /* Return true if calls to symbol-ref SYM should not go through
1163    plt stubs.  */
1164
1165 bool
1166 aarch64_is_noplt_call_p (rtx sym)
1167 {
1168   const_tree decl = SYMBOL_REF_DECL (sym);
1169
1170   if (flag_pic
1171       && decl
1172       && (!flag_plt
1173           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1174       && !targetm.binds_local_p (decl))
1175     return true;
1176
1177   return false;
1178 }
1179
1180 /* Return true if the offsets to a zero/sign-extract operation
1181    represent an expression that matches an extend operation.  The
1182    operands represent the paramters from
1183
1184    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1185 bool
1186 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1187                                 rtx extract_imm)
1188 {
1189   HOST_WIDE_INT mult_val, extract_val;
1190
1191   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1192     return false;
1193
1194   mult_val = INTVAL (mult_imm);
1195   extract_val = INTVAL (extract_imm);
1196
1197   if (extract_val > 8
1198       && extract_val < GET_MODE_BITSIZE (mode)
1199       && exact_log2 (extract_val & ~7) > 0
1200       && (extract_val & 7) <= 4
1201       && mult_val == (1 << (extract_val & 7)))
1202     return true;
1203
1204   return false;
1205 }
1206
1207 /* Emit an insn that's a simple single-set.  Both the operands must be
1208    known to be valid.  */
1209 inline static rtx_insn *
1210 emit_set_insn (rtx x, rtx y)
1211 {
1212   return emit_insn (gen_rtx_SET (x, y));
1213 }
1214
1215 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1216    return the rtx for register 0 in the proper mode.  */
1217 rtx
1218 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1219 {
1220   machine_mode mode = SELECT_CC_MODE (code, x, y);
1221   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1222
1223   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1224   return cc_reg;
1225 }
1226
1227 /* Build the SYMBOL_REF for __tls_get_addr.  */
1228
1229 static GTY(()) rtx tls_get_addr_libfunc;
1230
1231 rtx
1232 aarch64_tls_get_addr (void)
1233 {
1234   if (!tls_get_addr_libfunc)
1235     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1236   return tls_get_addr_libfunc;
1237 }
1238
1239 /* Return the TLS model to use for ADDR.  */
1240
1241 static enum tls_model
1242 tls_symbolic_operand_type (rtx addr)
1243 {
1244   enum tls_model tls_kind = TLS_MODEL_NONE;
1245   rtx sym, addend;
1246
1247   if (GET_CODE (addr) == CONST)
1248     {
1249       split_const (addr, &sym, &addend);
1250       if (GET_CODE (sym) == SYMBOL_REF)
1251         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1252     }
1253   else if (GET_CODE (addr) == SYMBOL_REF)
1254     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1255
1256   return tls_kind;
1257 }
1258
1259 /* We'll allow lo_sum's in addresses in our legitimate addresses
1260    so that combine would take care of combining addresses where
1261    necessary, but for generation purposes, we'll generate the address
1262    as :
1263    RTL                               Absolute
1264    tmp = hi (symbol_ref);            adrp  x1, foo
1265    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1266                                      nop
1267
1268    PIC                               TLS
1269    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1270    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1271                                      bl   __tls_get_addr
1272                                      nop
1273
1274    Load TLS symbol, depending on TLS mechanism and TLS access model.
1275
1276    Global Dynamic - Traditional TLS:
1277    adrp tmp, :tlsgd:imm
1278    add  dest, tmp, #:tlsgd_lo12:imm
1279    bl   __tls_get_addr
1280
1281    Global Dynamic - TLS Descriptors:
1282    adrp dest, :tlsdesc:imm
1283    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1284    add  dest, dest, #:tlsdesc_lo12:imm
1285    blr  tmp
1286    mrs  tp, tpidr_el0
1287    add  dest, dest, tp
1288
1289    Initial Exec:
1290    mrs  tp, tpidr_el0
1291    adrp tmp, :gottprel:imm
1292    ldr  dest, [tmp, #:gottprel_lo12:imm]
1293    add  dest, dest, tp
1294
1295    Local Exec:
1296    mrs  tp, tpidr_el0
1297    add  t0, tp, #:tprel_hi12:imm, lsl #12
1298    add  t0, t0, #:tprel_lo12_nc:imm
1299 */
1300
1301 static void
1302 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1303                                    enum aarch64_symbol_type type)
1304 {
1305   switch (type)
1306     {
1307     case SYMBOL_SMALL_ABSOLUTE:
1308       {
1309         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1310         rtx tmp_reg = dest;
1311         machine_mode mode = GET_MODE (dest);
1312
1313         gcc_assert (mode == Pmode || mode == ptr_mode);
1314
1315         if (can_create_pseudo_p ())
1316           tmp_reg = gen_reg_rtx (mode);
1317
1318         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1319         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1320         return;
1321       }
1322
1323     case SYMBOL_TINY_ABSOLUTE:
1324       emit_insn (gen_rtx_SET (dest, imm));
1325       return;
1326
1327     case SYMBOL_SMALL_GOT_28K:
1328       {
1329         machine_mode mode = GET_MODE (dest);
1330         rtx gp_rtx = pic_offset_table_rtx;
1331         rtx insn;
1332         rtx mem;
1333
1334         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1335            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1336            decide rtx costs, in which case pic_offset_table_rtx is not
1337            initialized.  For that case no need to generate the first adrp
1338            instruction as the final cost for global variable access is
1339            one instruction.  */
1340         if (gp_rtx != NULL)
1341           {
1342             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1343                using the page base as GOT base, the first page may be wasted,
1344                in the worst scenario, there is only 28K space for GOT).
1345
1346                The generate instruction sequence for accessing global variable
1347                is:
1348
1349                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1350
1351                Only one instruction needed. But we must initialize
1352                pic_offset_table_rtx properly.  We generate initialize insn for
1353                every global access, and allow CSE to remove all redundant.
1354
1355                The final instruction sequences will look like the following
1356                for multiply global variables access.
1357
1358                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1359
1360                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1361                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1362                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1363                  ...  */
1364
1365             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1366             crtl->uses_pic_offset_table = 1;
1367             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1368
1369             if (mode != GET_MODE (gp_rtx))
1370              gp_rtx = gen_lowpart (mode, gp_rtx);
1371
1372           }
1373
1374         if (mode == ptr_mode)
1375           {
1376             if (mode == DImode)
1377               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1378             else
1379               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1380
1381             mem = XVECEXP (SET_SRC (insn), 0, 0);
1382           }
1383         else
1384           {
1385             gcc_assert (mode == Pmode);
1386
1387             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1388             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1389           }
1390
1391         /* The operand is expected to be MEM.  Whenever the related insn
1392            pattern changed, above code which calculate mem should be
1393            updated.  */
1394         gcc_assert (GET_CODE (mem) == MEM);
1395         MEM_READONLY_P (mem) = 1;
1396         MEM_NOTRAP_P (mem) = 1;
1397         emit_insn (insn);
1398         return;
1399       }
1400
1401     case SYMBOL_SMALL_GOT_4G:
1402       {
1403         /* In ILP32, the mode of dest can be either SImode or DImode,
1404            while the got entry is always of SImode size.  The mode of
1405            dest depends on how dest is used: if dest is assigned to a
1406            pointer (e.g. in the memory), it has SImode; it may have
1407            DImode if dest is dereferenced to access the memeory.
1408            This is why we have to handle three different ldr_got_small
1409            patterns here (two patterns for ILP32).  */
1410
1411         rtx insn;
1412         rtx mem;
1413         rtx tmp_reg = dest;
1414         machine_mode mode = GET_MODE (dest);
1415
1416         if (can_create_pseudo_p ())
1417           tmp_reg = gen_reg_rtx (mode);
1418
1419         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1420         if (mode == ptr_mode)
1421           {
1422             if (mode == DImode)
1423               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1424             else
1425               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1426
1427             mem = XVECEXP (SET_SRC (insn), 0, 0);
1428           }
1429         else
1430           {
1431             gcc_assert (mode == Pmode);
1432
1433             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1434             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1435           }
1436
1437         gcc_assert (GET_CODE (mem) == MEM);
1438         MEM_READONLY_P (mem) = 1;
1439         MEM_NOTRAP_P (mem) = 1;
1440         emit_insn (insn);
1441         return;
1442       }
1443
1444     case SYMBOL_SMALL_TLSGD:
1445       {
1446         rtx_insn *insns;
1447         machine_mode mode = GET_MODE (dest);
1448         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1449
1450         start_sequence ();
1451         if (TARGET_ILP32)
1452           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1453         else
1454           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1455         insns = get_insns ();
1456         end_sequence ();
1457
1458         RTL_CONST_CALL_P (insns) = 1;
1459         emit_libcall_block (insns, dest, result, imm);
1460         return;
1461       }
1462
1463     case SYMBOL_SMALL_TLSDESC:
1464       {
1465         machine_mode mode = GET_MODE (dest);
1466         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1467         rtx tp;
1468
1469         gcc_assert (mode == Pmode || mode == ptr_mode);
1470
1471         /* In ILP32, the got entry is always of SImode size.  Unlike
1472            small GOT, the dest is fixed at reg 0.  */
1473         if (TARGET_ILP32)
1474           emit_insn (gen_tlsdesc_small_si (imm));
1475         else
1476           emit_insn (gen_tlsdesc_small_di (imm));
1477         tp = aarch64_load_tp (NULL);
1478
1479         if (mode != Pmode)
1480           tp = gen_lowpart (mode, tp);
1481
1482         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1483         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1484         return;
1485       }
1486
1487     case SYMBOL_SMALL_TLSIE:
1488       {
1489         /* In ILP32, the mode of dest can be either SImode or DImode,
1490            while the got entry is always of SImode size.  The mode of
1491            dest depends on how dest is used: if dest is assigned to a
1492            pointer (e.g. in the memory), it has SImode; it may have
1493            DImode if dest is dereferenced to access the memeory.
1494            This is why we have to handle three different tlsie_small
1495            patterns here (two patterns for ILP32).  */
1496         machine_mode mode = GET_MODE (dest);
1497         rtx tmp_reg = gen_reg_rtx (mode);
1498         rtx tp = aarch64_load_tp (NULL);
1499
1500         if (mode == ptr_mode)
1501           {
1502             if (mode == DImode)
1503               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1504             else
1505               {
1506                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1507                 tp = gen_lowpart (mode, tp);
1508               }
1509           }
1510         else
1511           {
1512             gcc_assert (mode == Pmode);
1513             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1514           }
1515
1516         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1517         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518         return;
1519       }
1520
1521     case SYMBOL_TLSLE12:
1522     case SYMBOL_TLSLE24:
1523     case SYMBOL_TLSLE32:
1524     case SYMBOL_TLSLE48:
1525       {
1526         machine_mode mode = GET_MODE (dest);
1527         rtx tp = aarch64_load_tp (NULL);
1528
1529         if (mode != Pmode)
1530           tp = gen_lowpart (mode, tp);
1531
1532         switch (type)
1533           {
1534           case SYMBOL_TLSLE12:
1535             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1536                         (dest, tp, imm));
1537             break;
1538           case SYMBOL_TLSLE24:
1539             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1540                         (dest, tp, imm));
1541           break;
1542           case SYMBOL_TLSLE32:
1543             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1544                         (dest, imm));
1545             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1546                         (dest, dest, tp));
1547           break;
1548           case SYMBOL_TLSLE48:
1549             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1550                         (dest, imm));
1551             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1552                         (dest, dest, tp));
1553             break;
1554           default:
1555             gcc_unreachable ();
1556           }
1557
1558         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1559         return;
1560       }
1561
1562     case SYMBOL_TINY_GOT:
1563       emit_insn (gen_ldr_got_tiny (dest, imm));
1564       return;
1565
1566     case SYMBOL_TINY_TLSIE:
1567       {
1568         machine_mode mode = GET_MODE (dest);
1569         rtx tp = aarch64_load_tp (NULL);
1570
1571         if (mode == ptr_mode)
1572           {
1573             if (mode == DImode)
1574               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1575             else
1576               {
1577                 tp = gen_lowpart (mode, tp);
1578                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1579               }
1580           }
1581         else
1582           {
1583             gcc_assert (mode == Pmode);
1584             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1585           }
1586
1587         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1588         return;
1589       }
1590
1591     default:
1592       gcc_unreachable ();
1593     }
1594 }
1595
1596 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1597    handle all moves if !can_create_pseudo_p ().  The distinction is
1598    important because, unlike emit_move_insn, the move expanders know
1599    how to force Pmode objects into the constant pool even when the
1600    constant pool address is not itself legitimate.  */
1601 static rtx
1602 aarch64_emit_move (rtx dest, rtx src)
1603 {
1604   return (can_create_pseudo_p ()
1605           ? emit_move_insn (dest, src)
1606           : emit_move_insn_1 (dest, src));
1607 }
1608
1609 /* Split a 128-bit move operation into two 64-bit move operations,
1610    taking care to handle partial overlap of register to register
1611    copies.  Special cases are needed when moving between GP regs and
1612    FP regs.  SRC can be a register, constant or memory; DST a register
1613    or memory.  If either operand is memory it must not have any side
1614    effects.  */
1615 void
1616 aarch64_split_128bit_move (rtx dst, rtx src)
1617 {
1618   rtx dst_lo, dst_hi;
1619   rtx src_lo, src_hi;
1620
1621   machine_mode mode = GET_MODE (dst);
1622
1623   gcc_assert (mode == TImode || mode == TFmode);
1624   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1625   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1626
1627   if (REG_P (dst) && REG_P (src))
1628     {
1629       int src_regno = REGNO (src);
1630       int dst_regno = REGNO (dst);
1631
1632       /* Handle FP <-> GP regs.  */
1633       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1634         {
1635           src_lo = gen_lowpart (word_mode, src);
1636           src_hi = gen_highpart (word_mode, src);
1637
1638           if (mode == TImode)
1639             {
1640               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1641               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1642             }
1643           else
1644             {
1645               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1646               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1647             }
1648           return;
1649         }
1650       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1651         {
1652           dst_lo = gen_lowpart (word_mode, dst);
1653           dst_hi = gen_highpart (word_mode, dst);
1654
1655           if (mode == TImode)
1656             {
1657               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1658               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1659             }
1660           else
1661             {
1662               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1663               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1664             }
1665           return;
1666         }
1667     }
1668
1669   dst_lo = gen_lowpart (word_mode, dst);
1670   dst_hi = gen_highpart (word_mode, dst);
1671   src_lo = gen_lowpart (word_mode, src);
1672   src_hi = gen_highpart_mode (word_mode, mode, src);
1673
1674   /* At most one pairing may overlap.  */
1675   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1676     {
1677       aarch64_emit_move (dst_hi, src_hi);
1678       aarch64_emit_move (dst_lo, src_lo);
1679     }
1680   else
1681     {
1682       aarch64_emit_move (dst_lo, src_lo);
1683       aarch64_emit_move (dst_hi, src_hi);
1684     }
1685 }
1686
1687 bool
1688 aarch64_split_128bit_move_p (rtx dst, rtx src)
1689 {
1690   return (! REG_P (src)
1691           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1692 }
1693
1694 /* Split a complex SIMD combine.  */
1695
1696 void
1697 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1698 {
1699   machine_mode src_mode = GET_MODE (src1);
1700   machine_mode dst_mode = GET_MODE (dst);
1701
1702   gcc_assert (VECTOR_MODE_P (dst_mode));
1703   gcc_assert (register_operand (dst, dst_mode)
1704               && register_operand (src1, src_mode)
1705               && register_operand (src2, src_mode));
1706
1707   rtx (*gen) (rtx, rtx, rtx);
1708
1709   switch (src_mode)
1710     {
1711     case E_V8QImode:
1712       gen = gen_aarch64_simd_combinev8qi;
1713       break;
1714     case E_V4HImode:
1715       gen = gen_aarch64_simd_combinev4hi;
1716       break;
1717     case E_V2SImode:
1718       gen = gen_aarch64_simd_combinev2si;
1719       break;
1720     case E_V4HFmode:
1721       gen = gen_aarch64_simd_combinev4hf;
1722       break;
1723     case E_V2SFmode:
1724       gen = gen_aarch64_simd_combinev2sf;
1725       break;
1726     case E_DImode:
1727       gen = gen_aarch64_simd_combinedi;
1728       break;
1729     case E_DFmode:
1730       gen = gen_aarch64_simd_combinedf;
1731       break;
1732     default:
1733       gcc_unreachable ();
1734     }
1735
1736   emit_insn (gen (dst, src1, src2));
1737   return;
1738 }
1739
1740 /* Split a complex SIMD move.  */
1741
1742 void
1743 aarch64_split_simd_move (rtx dst, rtx src)
1744 {
1745   machine_mode src_mode = GET_MODE (src);
1746   machine_mode dst_mode = GET_MODE (dst);
1747
1748   gcc_assert (VECTOR_MODE_P (dst_mode));
1749
1750   if (REG_P (dst) && REG_P (src))
1751     {
1752       rtx (*gen) (rtx, rtx);
1753
1754       gcc_assert (VECTOR_MODE_P (src_mode));
1755
1756       switch (src_mode)
1757         {
1758         case E_V16QImode:
1759           gen = gen_aarch64_split_simd_movv16qi;
1760           break;
1761         case E_V8HImode:
1762           gen = gen_aarch64_split_simd_movv8hi;
1763           break;
1764         case E_V4SImode:
1765           gen = gen_aarch64_split_simd_movv4si;
1766           break;
1767         case E_V2DImode:
1768           gen = gen_aarch64_split_simd_movv2di;
1769           break;
1770         case E_V8HFmode:
1771           gen = gen_aarch64_split_simd_movv8hf;
1772           break;
1773         case E_V4SFmode:
1774           gen = gen_aarch64_split_simd_movv4sf;
1775           break;
1776         case E_V2DFmode:
1777           gen = gen_aarch64_split_simd_movv2df;
1778           break;
1779         default:
1780           gcc_unreachable ();
1781         }
1782
1783       emit_insn (gen (dst, src));
1784       return;
1785     }
1786 }
1787
1788 bool
1789 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1790                               machine_mode ymode, rtx y)
1791 {
1792   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1793   gcc_assert (r != NULL);
1794   return rtx_equal_p (x, r);
1795 }
1796
1797
1798 static rtx
1799 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1800 {
1801   if (can_create_pseudo_p ())
1802     return force_reg (mode, value);
1803   else
1804     {
1805       x = aarch64_emit_move (x, value);
1806       return x;
1807     }
1808 }
1809
1810
1811 static rtx
1812 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1813                     HOST_WIDE_INT offset)
1814 {
1815   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1816     {
1817       rtx high;
1818       /* Load the full offset into a register.  This
1819          might be improvable in the future.  */
1820       high = GEN_INT (offset);
1821       offset = 0;
1822       high = aarch64_force_temporary (mode, temp, high);
1823       reg = aarch64_force_temporary (mode, temp,
1824                                      gen_rtx_PLUS (mode, high, reg));
1825     }
1826   return plus_constant (mode, reg, offset);
1827 }
1828
1829 static int
1830 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1831                                 scalar_int_mode mode)
1832 {
1833   int i;
1834   unsigned HOST_WIDE_INT val, val2, mask;
1835   int one_match, zero_match;
1836   int num_insns;
1837
1838   val = INTVAL (imm);
1839
1840   if (aarch64_move_imm (val, mode))
1841     {
1842       if (generate)
1843         emit_insn (gen_rtx_SET (dest, imm));
1844       return 1;
1845     }
1846
1847   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1848      (with XXXX non-zero). In that case check to see if the move can be done in
1849      a smaller mode.  */
1850   val2 = val & 0xffffffff;
1851   if (mode == DImode
1852       && aarch64_move_imm (val2, SImode)
1853       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1854     {
1855       if (generate)
1856         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1857
1858       /* Check if we have to emit a second instruction by checking to see
1859          if any of the upper 32 bits of the original DI mode value is set.  */
1860       if (val == val2)
1861         return 1;
1862
1863       i = (val >> 48) ? 48 : 32;
1864
1865       if (generate)
1866          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1867                                     GEN_INT ((val >> i) & 0xffff)));
1868
1869       return 2;
1870     }
1871
1872   if ((val >> 32) == 0 || mode == SImode)
1873     {
1874       if (generate)
1875         {
1876           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1877           if (mode == SImode)
1878             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1879                                        GEN_INT ((val >> 16) & 0xffff)));
1880           else
1881             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1882                                        GEN_INT ((val >> 16) & 0xffff)));
1883         }
1884       return 2;
1885     }
1886
1887   /* Remaining cases are all for DImode.  */
1888
1889   mask = 0xffff;
1890   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1891     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1892   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1893     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1894
1895   if (zero_match != 2 && one_match != 2)
1896     {
1897       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1898          For a 64-bit bitmask try whether changing 16 bits to all ones or
1899          zeroes creates a valid bitmask.  To check any repeated bitmask,
1900          try using 16 bits from the other 32-bit half of val.  */
1901
1902       for (i = 0; i < 64; i += 16, mask <<= 16)
1903         {
1904           val2 = val & ~mask;
1905           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1906             break;
1907           val2 = val | mask;
1908           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1909             break;
1910           val2 = val2 & ~mask;
1911           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1912           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1913             break;
1914         }
1915       if (i != 64)
1916         {
1917           if (generate)
1918             {
1919               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1920               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1921                                          GEN_INT ((val >> i) & 0xffff)));
1922             }
1923           return 2;
1924         }
1925     }
1926
1927   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1928      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1929      otherwise skip zero bits.  */
1930
1931   num_insns = 1;
1932   mask = 0xffff;
1933   val2 = one_match > zero_match ? ~val : val;
1934   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1935
1936   if (generate)
1937     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1938                                            ? (val | ~(mask << i))
1939                                            : (val & (mask << i)))));
1940   for (i += 16; i < 64; i += 16)
1941     {
1942       if ((val2 & (mask << i)) == 0)
1943         continue;
1944       if (generate)
1945         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1946                                    GEN_INT ((val >> i) & 0xffff)));
1947       num_insns ++;
1948     }
1949
1950   return num_insns;
1951 }
1952
1953
1954 void
1955 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1956 {
1957   machine_mode mode = GET_MODE (dest);
1958
1959   gcc_assert (mode == SImode || mode == DImode);
1960
1961   /* Check on what type of symbol it is.  */
1962   scalar_int_mode int_mode;
1963   if ((GET_CODE (imm) == SYMBOL_REF
1964        || GET_CODE (imm) == LABEL_REF
1965        || GET_CODE (imm) == CONST)
1966       && is_a <scalar_int_mode> (mode, &int_mode))
1967     {
1968       rtx mem, base, offset;
1969       enum aarch64_symbol_type sty;
1970
1971       /* If we have (const (plus symbol offset)), separate out the offset
1972          before we start classifying the symbol.  */
1973       split_const (imm, &base, &offset);
1974
1975       sty = aarch64_classify_symbol (base, offset);
1976       switch (sty)
1977         {
1978         case SYMBOL_FORCE_TO_MEM:
1979           if (offset != const0_rtx
1980               && targetm.cannot_force_const_mem (int_mode, imm))
1981             {
1982               gcc_assert (can_create_pseudo_p ());
1983               base = aarch64_force_temporary (int_mode, dest, base);
1984               base = aarch64_add_offset (int_mode, NULL, base,
1985                                          INTVAL (offset));
1986               aarch64_emit_move (dest, base);
1987               return;
1988             }
1989
1990           mem = force_const_mem (ptr_mode, imm);
1991           gcc_assert (mem);
1992
1993           /* If we aren't generating PC relative literals, then
1994              we need to expand the literal pool access carefully.
1995              This is something that needs to be done in a number
1996              of places, so could well live as a separate function.  */
1997           if (!aarch64_pcrelative_literal_loads)
1998             {
1999               gcc_assert (can_create_pseudo_p ());
2000               base = gen_reg_rtx (ptr_mode);
2001               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2002               if (ptr_mode != Pmode)
2003                 base = convert_memory_address (Pmode, base);
2004               mem = gen_rtx_MEM (ptr_mode, base);
2005             }
2006
2007           if (int_mode != ptr_mode)
2008             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2009
2010           emit_insn (gen_rtx_SET (dest, mem));
2011
2012           return;
2013
2014         case SYMBOL_SMALL_TLSGD:
2015         case SYMBOL_SMALL_TLSDESC:
2016         case SYMBOL_SMALL_TLSIE:
2017         case SYMBOL_SMALL_GOT_28K:
2018         case SYMBOL_SMALL_GOT_4G:
2019         case SYMBOL_TINY_GOT:
2020         case SYMBOL_TINY_TLSIE:
2021           if (offset != const0_rtx)
2022             {
2023               gcc_assert(can_create_pseudo_p ());
2024               base = aarch64_force_temporary (int_mode, dest, base);
2025               base = aarch64_add_offset (int_mode, NULL, base,
2026                                          INTVAL (offset));
2027               aarch64_emit_move (dest, base);
2028               return;
2029             }
2030           /* FALLTHRU */
2031
2032         case SYMBOL_SMALL_ABSOLUTE:
2033         case SYMBOL_TINY_ABSOLUTE:
2034         case SYMBOL_TLSLE12:
2035         case SYMBOL_TLSLE24:
2036         case SYMBOL_TLSLE32:
2037         case SYMBOL_TLSLE48:
2038           aarch64_load_symref_appropriately (dest, imm, sty);
2039           return;
2040
2041         default:
2042           gcc_unreachable ();
2043         }
2044     }
2045
2046   if (!CONST_INT_P (imm))
2047     {
2048       if (GET_CODE (imm) == HIGH)
2049         emit_insn (gen_rtx_SET (dest, imm));
2050       else
2051         {
2052           rtx mem = force_const_mem (mode, imm);
2053           gcc_assert (mem);
2054           emit_insn (gen_rtx_SET (dest, mem));
2055         }
2056
2057       return;
2058     }
2059
2060   aarch64_internal_mov_immediate (dest, imm, true,
2061                                   as_a <scalar_int_mode> (mode));
2062 }
2063
2064 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2065    temporary value if necessary.  FRAME_RELATED_P should be true if
2066    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2067    to the generated instructions.  If SCRATCHREG is known to hold
2068    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2069    immediate again.
2070
2071    Since this function may be used to adjust the stack pointer, we must
2072    ensure that it cannot cause transient stack deallocation (for example
2073    by first incrementing SP and then decrementing when adjusting by a
2074    large immediate).  */
2075
2076 static void
2077 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2078                                int scratchreg, HOST_WIDE_INT delta,
2079                                bool frame_related_p, bool emit_move_imm)
2080 {
2081   HOST_WIDE_INT mdelta = abs_hwi (delta);
2082   rtx this_rtx = gen_rtx_REG (mode, regnum);
2083   rtx_insn *insn;
2084
2085   if (!mdelta)
2086     return;
2087
2088   /* Single instruction adjustment.  */
2089   if (aarch64_uimm12_shift (mdelta))
2090     {
2091       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2092       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2093       return;
2094     }
2095
2096   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2097      Only do this if mdelta is not a 16-bit move as adjusting using a move
2098      is better.  */
2099   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2100     {
2101       HOST_WIDE_INT low_off = mdelta & 0xfff;
2102
2103       low_off = delta < 0 ? -low_off : low_off;
2104       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2105       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2106       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2107       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2108       return;
2109     }
2110
2111   /* Emit a move immediate if required and an addition/subtraction.  */
2112   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2113   if (emit_move_imm)
2114     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2115   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2116                               : gen_add2_insn (this_rtx, scratch_rtx));
2117   if (frame_related_p)
2118     {
2119       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2120       rtx adj = plus_constant (mode, this_rtx, delta);
2121       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2122     }
2123 }
2124
2125 static inline void
2126 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2127                       HOST_WIDE_INT delta)
2128 {
2129   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2130 }
2131
2132 static inline void
2133 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2134 {
2135   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2136                                  true, emit_move_imm);
2137 }
2138
2139 static inline void
2140 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2141 {
2142   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2143                                  frame_related_p, true);
2144 }
2145
2146 static bool
2147 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2148                                  tree exp ATTRIBUTE_UNUSED)
2149 {
2150   /* Currently, always true.  */
2151   return true;
2152 }
2153
2154 /* Implement TARGET_PASS_BY_REFERENCE.  */
2155
2156 static bool
2157 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2158                            machine_mode mode,
2159                            const_tree type,
2160                            bool named ATTRIBUTE_UNUSED)
2161 {
2162   HOST_WIDE_INT size;
2163   machine_mode dummymode;
2164   int nregs;
2165
2166   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2167   size = (mode == BLKmode && type)
2168     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2169
2170   /* Aggregates are passed by reference based on their size.  */
2171   if (type && AGGREGATE_TYPE_P (type))
2172     {
2173       size = int_size_in_bytes (type);
2174     }
2175
2176   /* Variable sized arguments are always returned by reference.  */
2177   if (size < 0)
2178     return true;
2179
2180   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2181   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2182                                                &dummymode, &nregs,
2183                                                NULL))
2184     return false;
2185
2186   /* Arguments which are variable sized or larger than 2 registers are
2187      passed by reference unless they are a homogenous floating point
2188      aggregate.  */
2189   return size > 2 * UNITS_PER_WORD;
2190 }
2191
2192 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2193 static bool
2194 aarch64_return_in_msb (const_tree valtype)
2195 {
2196   machine_mode dummy_mode;
2197   int dummy_int;
2198
2199   /* Never happens in little-endian mode.  */
2200   if (!BYTES_BIG_ENDIAN)
2201     return false;
2202
2203   /* Only composite types smaller than or equal to 16 bytes can
2204      be potentially returned in registers.  */
2205   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2206       || int_size_in_bytes (valtype) <= 0
2207       || int_size_in_bytes (valtype) > 16)
2208     return false;
2209
2210   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2211      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2212      is always passed/returned in the least significant bits of fp/simd
2213      register(s).  */
2214   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2215                                                &dummy_mode, &dummy_int, NULL))
2216     return false;
2217
2218   return true;
2219 }
2220
2221 /* Implement TARGET_FUNCTION_VALUE.
2222    Define how to find the value returned by a function.  */
2223
2224 static rtx
2225 aarch64_function_value (const_tree type, const_tree func,
2226                         bool outgoing ATTRIBUTE_UNUSED)
2227 {
2228   machine_mode mode;
2229   int unsignedp;
2230   int count;
2231   machine_mode ag_mode;
2232
2233   mode = TYPE_MODE (type);
2234   if (INTEGRAL_TYPE_P (type))
2235     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2236
2237   if (aarch64_return_in_msb (type))
2238     {
2239       HOST_WIDE_INT size = int_size_in_bytes (type);
2240
2241       if (size % UNITS_PER_WORD != 0)
2242         {
2243           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2244           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2245         }
2246     }
2247
2248   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2249                                                &ag_mode, &count, NULL))
2250     {
2251       if (!aarch64_composite_type_p (type, mode))
2252         {
2253           gcc_assert (count == 1 && mode == ag_mode);
2254           return gen_rtx_REG (mode, V0_REGNUM);
2255         }
2256       else
2257         {
2258           int i;
2259           rtx par;
2260
2261           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2262           for (i = 0; i < count; i++)
2263             {
2264               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2265               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2266                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2267               XVECEXP (par, 0, i) = tmp;
2268             }
2269           return par;
2270         }
2271     }
2272   else
2273     return gen_rtx_REG (mode, R0_REGNUM);
2274 }
2275
2276 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2277    Return true if REGNO is the number of a hard register in which the values
2278    of called function may come back.  */
2279
2280 static bool
2281 aarch64_function_value_regno_p (const unsigned int regno)
2282 {
2283   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2284      of 16-byte return values are: 128-bit integers and 16-byte small
2285      structures (excluding homogeneous floating-point aggregates).  */
2286   if (regno == R0_REGNUM || regno == R1_REGNUM)
2287     return true;
2288
2289   /* Up to four fp/simd registers can return a function value, e.g. a
2290      homogeneous floating-point aggregate having four members.  */
2291   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2292     return TARGET_FLOAT;
2293
2294   return false;
2295 }
2296
2297 /* Implement TARGET_RETURN_IN_MEMORY.
2298
2299    If the type T of the result of a function is such that
2300      void func (T arg)
2301    would require that arg be passed as a value in a register (or set of
2302    registers) according to the parameter passing rules, then the result
2303    is returned in the same registers as would be used for such an
2304    argument.  */
2305
2306 static bool
2307 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2308 {
2309   HOST_WIDE_INT size;
2310   machine_mode ag_mode;
2311   int count;
2312
2313   if (!AGGREGATE_TYPE_P (type)
2314       && TREE_CODE (type) != COMPLEX_TYPE
2315       && TREE_CODE (type) != VECTOR_TYPE)
2316     /* Simple scalar types always returned in registers.  */
2317     return false;
2318
2319   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2320                                                type,
2321                                                &ag_mode,
2322                                                &count,
2323                                                NULL))
2324     return false;
2325
2326   /* Types larger than 2 registers returned in memory.  */
2327   size = int_size_in_bytes (type);
2328   return (size < 0 || size > 2 * UNITS_PER_WORD);
2329 }
2330
2331 static bool
2332 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2333                                const_tree type, int *nregs)
2334 {
2335   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2336   return aarch64_vfp_is_call_or_return_candidate (mode,
2337                                                   type,
2338                                                   &pcum->aapcs_vfp_rmode,
2339                                                   nregs,
2340                                                   NULL);
2341 }
2342
2343 /* Given MODE and TYPE of a function argument, return the alignment in
2344    bits.  The idea is to suppress any stronger alignment requested by
2345    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2346    This is a helper function for local use only.  */
2347
2348 static unsigned int
2349 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2350 {
2351   if (!type)
2352     return GET_MODE_ALIGNMENT (mode);
2353
2354   if (integer_zerop (TYPE_SIZE (type)))
2355     return 0;
2356
2357   gcc_assert (TYPE_MODE (type) == mode);
2358
2359   if (!AGGREGATE_TYPE_P (type))
2360     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2361
2362   if (TREE_CODE (type) == ARRAY_TYPE)
2363     return TYPE_ALIGN (TREE_TYPE (type));
2364
2365   unsigned int alignment = 0;
2366   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2367     if (TREE_CODE (field) == FIELD_DECL)
2368       alignment = std::max (alignment, DECL_ALIGN (field));
2369
2370   return alignment;
2371 }
2372
2373 /* Layout a function argument according to the AAPCS64 rules.  The rule
2374    numbers refer to the rule numbers in the AAPCS64.  */
2375
2376 static void
2377 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2378                     const_tree type,
2379                     bool named ATTRIBUTE_UNUSED)
2380 {
2381   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2382   int ncrn, nvrn, nregs;
2383   bool allocate_ncrn, allocate_nvrn;
2384   HOST_WIDE_INT size;
2385
2386   /* We need to do this once per argument.  */
2387   if (pcum->aapcs_arg_processed)
2388     return;
2389
2390   pcum->aapcs_arg_processed = true;
2391
2392   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2393   size
2394     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2395                 UNITS_PER_WORD);
2396
2397   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2398   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2399                                                  mode,
2400                                                  type,
2401                                                  &nregs);
2402
2403   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2404      The following code thus handles passing by SIMD/FP registers first.  */
2405
2406   nvrn = pcum->aapcs_nvrn;
2407
2408   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2409      and homogenous short-vector aggregates (HVA).  */
2410   if (allocate_nvrn)
2411     {
2412       if (!TARGET_FLOAT)
2413         aarch64_err_no_fpadvsimd (mode, "argument");
2414
2415       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2416         {
2417           pcum->aapcs_nextnvrn = nvrn + nregs;
2418           if (!aarch64_composite_type_p (type, mode))
2419             {
2420               gcc_assert (nregs == 1);
2421               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2422             }
2423           else
2424             {
2425               rtx par;
2426               int i;
2427               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2428               for (i = 0; i < nregs; i++)
2429                 {
2430                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2431                                          V0_REGNUM + nvrn + i);
2432                   tmp = gen_rtx_EXPR_LIST
2433                     (VOIDmode, tmp,
2434                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2435                   XVECEXP (par, 0, i) = tmp;
2436                 }
2437               pcum->aapcs_reg = par;
2438             }
2439           return;
2440         }
2441       else
2442         {
2443           /* C.3 NSRN is set to 8.  */
2444           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2445           goto on_stack;
2446         }
2447     }
2448
2449   ncrn = pcum->aapcs_ncrn;
2450   nregs = size / UNITS_PER_WORD;
2451
2452   /* C6 - C9.  though the sign and zero extension semantics are
2453      handled elsewhere.  This is the case where the argument fits
2454      entirely general registers.  */
2455   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2456     {
2457
2458       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2459
2460       /* C.8 if the argument has an alignment of 16 then the NGRN is
2461          rounded up to the next even number.  */
2462       if (nregs == 2
2463           && ncrn % 2
2464           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2465              comparison is there because for > 16 * BITS_PER_UNIT
2466              alignment nregs should be > 2 and therefore it should be
2467              passed by reference rather than value.  */
2468           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2469         {
2470           ++ncrn;
2471           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2472         }
2473
2474       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2475          A reg is still generated for it, but the caller should be smart
2476          enough not to use it.  */
2477       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2478         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2479       else
2480         {
2481           rtx par;
2482           int i;
2483
2484           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2485           for (i = 0; i < nregs; i++)
2486             {
2487               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2488               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2489                                        GEN_INT (i * UNITS_PER_WORD));
2490               XVECEXP (par, 0, i) = tmp;
2491             }
2492           pcum->aapcs_reg = par;
2493         }
2494
2495       pcum->aapcs_nextncrn = ncrn + nregs;
2496       return;
2497     }
2498
2499   /* C.11  */
2500   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2501
2502   /* The argument is passed on stack; record the needed number of words for
2503      this argument and align the total size if necessary.  */
2504 on_stack:
2505   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2506
2507   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2508     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2509                                        16 / UNITS_PER_WORD);
2510   return;
2511 }
2512
2513 /* Implement TARGET_FUNCTION_ARG.  */
2514
2515 static rtx
2516 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2517                       const_tree type, bool named)
2518 {
2519   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2520   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2521
2522   if (mode == VOIDmode)
2523     return NULL_RTX;
2524
2525   aarch64_layout_arg (pcum_v, mode, type, named);
2526   return pcum->aapcs_reg;
2527 }
2528
2529 void
2530 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2531                            const_tree fntype ATTRIBUTE_UNUSED,
2532                            rtx libname ATTRIBUTE_UNUSED,
2533                            const_tree fndecl ATTRIBUTE_UNUSED,
2534                            unsigned n_named ATTRIBUTE_UNUSED)
2535 {
2536   pcum->aapcs_ncrn = 0;
2537   pcum->aapcs_nvrn = 0;
2538   pcum->aapcs_nextncrn = 0;
2539   pcum->aapcs_nextnvrn = 0;
2540   pcum->pcs_variant = ARM_PCS_AAPCS64;
2541   pcum->aapcs_reg = NULL_RTX;
2542   pcum->aapcs_arg_processed = false;
2543   pcum->aapcs_stack_words = 0;
2544   pcum->aapcs_stack_size = 0;
2545
2546   if (!TARGET_FLOAT
2547       && fndecl && TREE_PUBLIC (fndecl)
2548       && fntype && fntype != error_mark_node)
2549     {
2550       const_tree type = TREE_TYPE (fntype);
2551       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2552       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2553       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2554                                                    &mode, &nregs, NULL))
2555         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2556     }
2557   return;
2558 }
2559
2560 static void
2561 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2562                               machine_mode mode,
2563                               const_tree type,
2564                               bool named)
2565 {
2566   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2567   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2568     {
2569       aarch64_layout_arg (pcum_v, mode, type, named);
2570       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2571                   != (pcum->aapcs_stack_words != 0));
2572       pcum->aapcs_arg_processed = false;
2573       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2574       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2575       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2576       pcum->aapcs_stack_words = 0;
2577       pcum->aapcs_reg = NULL_RTX;
2578     }
2579 }
2580
2581 bool
2582 aarch64_function_arg_regno_p (unsigned regno)
2583 {
2584   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2585           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2586 }
2587
2588 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2589    PARM_BOUNDARY bits of alignment, but will be given anything up
2590    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2591    that both before and after the layout of each argument, the Next
2592    Stacked Argument Address (NSAA) will have a minimum alignment of
2593    8 bytes.  */
2594
2595 static unsigned int
2596 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2597 {
2598   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2599   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2600 }
2601
2602 /* Implement TARGET_FUNCTION_ARG_PADDING.
2603
2604    Small aggregate types are placed in the lowest memory address.
2605
2606    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2607
2608 static pad_direction
2609 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2610 {
2611   /* On little-endian targets, the least significant byte of every stack
2612      argument is passed at the lowest byte address of the stack slot.  */
2613   if (!BYTES_BIG_ENDIAN)
2614     return PAD_UPWARD;
2615
2616   /* Otherwise, integral, floating-point and pointer types are padded downward:
2617      the least significant byte of a stack argument is passed at the highest
2618      byte address of the stack slot.  */
2619   if (type
2620       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2621          || POINTER_TYPE_P (type))
2622       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2623     return PAD_DOWNWARD;
2624
2625   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2626   return PAD_UPWARD;
2627 }
2628
2629 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2630
2631    It specifies padding for the last (may also be the only)
2632    element of a block move between registers and memory.  If
2633    assuming the block is in the memory, padding upward means that
2634    the last element is padded after its highest significant byte,
2635    while in downward padding, the last element is padded at the
2636    its least significant byte side.
2637
2638    Small aggregates and small complex types are always padded
2639    upwards.
2640
2641    We don't need to worry about homogeneous floating-point or
2642    short-vector aggregates; their move is not affected by the
2643    padding direction determined here.  Regardless of endianness,
2644    each element of such an aggregate is put in the least
2645    significant bits of a fp/simd register.
2646
2647    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2648    register has useful data, and return the opposite if the most
2649    significant byte does.  */
2650
2651 bool
2652 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2653                      bool first ATTRIBUTE_UNUSED)
2654 {
2655
2656   /* Small composite types are always padded upward.  */
2657   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2658     {
2659       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2660                             : GET_MODE_SIZE (mode));
2661       if (size < 2 * UNITS_PER_WORD)
2662         return true;
2663     }
2664
2665   /* Otherwise, use the default padding.  */
2666   return !BYTES_BIG_ENDIAN;
2667 }
2668
2669 static scalar_int_mode
2670 aarch64_libgcc_cmp_return_mode (void)
2671 {
2672   return SImode;
2673 }
2674
2675 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2676
2677 /* We use the 12-bit shifted immediate arithmetic instructions so values
2678    must be multiple of (1 << 12), i.e. 4096.  */
2679 #define ARITH_FACTOR 4096
2680
2681 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2682 #error Cannot use simple address calculation for stack probing
2683 #endif
2684
2685 /* The pair of scratch registers used for stack probing.  */
2686 #define PROBE_STACK_FIRST_REG  9
2687 #define PROBE_STACK_SECOND_REG 10
2688
2689 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2690    inclusive.  These are offsets from the current stack pointer.  */
2691
2692 static void
2693 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2694 {
2695   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2696
2697   /* See the same assertion on PROBE_INTERVAL above.  */
2698   gcc_assert ((first % ARITH_FACTOR) == 0);
2699
2700   /* See if we have a constant small number of probes to generate.  If so,
2701      that's the easy case.  */
2702   if (size <= PROBE_INTERVAL)
2703     {
2704       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2705
2706       emit_set_insn (reg1,
2707                      plus_constant (Pmode,
2708                                     stack_pointer_rtx, -(first + base)));
2709       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2710     }
2711
2712   /* The run-time loop is made up of 8 insns in the generic case while the
2713      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2714   else if (size <= 4 * PROBE_INTERVAL)
2715     {
2716       HOST_WIDE_INT i, rem;
2717
2718       emit_set_insn (reg1,
2719                      plus_constant (Pmode,
2720                                     stack_pointer_rtx,
2721                                     -(first + PROBE_INTERVAL)));
2722       emit_stack_probe (reg1);
2723
2724       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2725          it exceeds SIZE.  If only two probes are needed, this will not
2726          generate any code.  Then probe at FIRST + SIZE.  */
2727       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2728         {
2729           emit_set_insn (reg1,
2730                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2731           emit_stack_probe (reg1);
2732         }
2733
2734       rem = size - (i - PROBE_INTERVAL);
2735       if (rem > 256)
2736         {
2737           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2738
2739           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2740           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2741         }
2742       else
2743         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2744     }
2745
2746   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2747      extra careful with variables wrapping around because we might be at
2748      the very top (or the very bottom) of the address space and we have
2749      to be able to handle this case properly; in particular, we use an
2750      equality test for the loop condition.  */
2751   else
2752     {
2753       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2754
2755       /* Step 1: round SIZE to the previous multiple of the interval.  */
2756
2757       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2758
2759
2760       /* Step 2: compute initial and final value of the loop counter.  */
2761
2762       /* TEST_ADDR = SP + FIRST.  */
2763       emit_set_insn (reg1,
2764                      plus_constant (Pmode, stack_pointer_rtx, -first));
2765
2766       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2767       HOST_WIDE_INT adjustment = - (first + rounded_size);
2768       if (! aarch64_uimm12_shift (adjustment))
2769         {
2770           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2771                                           true, Pmode);
2772           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2773         }
2774       else
2775         {
2776           emit_set_insn (reg2,
2777                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2778         }
2779
2780       /* Step 3: the loop
2781
2782          do
2783            {
2784              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2785              probe at TEST_ADDR
2786            }
2787          while (TEST_ADDR != LAST_ADDR)
2788
2789          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2790          until it is equal to ROUNDED_SIZE.  */
2791
2792       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2793
2794
2795       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2796          that SIZE is equal to ROUNDED_SIZE.  */
2797
2798       if (size != rounded_size)
2799         {
2800           HOST_WIDE_INT rem = size - rounded_size;
2801
2802           if (rem > 256)
2803             {
2804               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2805
2806               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2807               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2808             }
2809           else
2810             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2811         }
2812     }
2813
2814   /* Make sure nothing is scheduled before we are done.  */
2815   emit_insn (gen_blockage ());
2816 }
2817
2818 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2819    absolute addresses.  */
2820
2821 const char *
2822 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2823 {
2824   static int labelno = 0;
2825   char loop_lab[32];
2826   rtx xops[2];
2827
2828   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2829
2830   /* Loop.  */
2831   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2832
2833   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2834   xops[0] = reg1;
2835   xops[1] = GEN_INT (PROBE_INTERVAL);
2836   output_asm_insn ("sub\t%0, %0, %1", xops);
2837
2838   /* Probe at TEST_ADDR.  */
2839   output_asm_insn ("str\txzr, [%0]", xops);
2840
2841   /* Test if TEST_ADDR == LAST_ADDR.  */
2842   xops[1] = reg2;
2843   output_asm_insn ("cmp\t%0, %1", xops);
2844
2845   /* Branch.  */
2846   fputs ("\tb.ne\t", asm_out_file);
2847   assemble_name_raw (asm_out_file, loop_lab);
2848   fputc ('\n', asm_out_file);
2849
2850   return "";
2851 }
2852
2853 static bool
2854 aarch64_frame_pointer_required (void)
2855 {
2856   /* In aarch64_override_options_after_change
2857      flag_omit_leaf_frame_pointer turns off the frame pointer by
2858      default.  Turn it back on now if we've not got a leaf
2859      function.  */
2860   if (flag_omit_leaf_frame_pointer
2861       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2862     return true;
2863
2864   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2865   if (crtl->calls_eh_return)
2866     return true;
2867
2868   return false;
2869 }
2870
2871 /* Mark the registers that need to be saved by the callee and calculate
2872    the size of the callee-saved registers area and frame record (both FP
2873    and LR may be omitted).  */
2874 static void
2875 aarch64_layout_frame (void)
2876 {
2877   HOST_WIDE_INT offset = 0;
2878   int regno, last_fp_reg = INVALID_REGNUM;
2879
2880   if (reload_completed && cfun->machine->frame.laid_out)
2881     return;
2882
2883 #define SLOT_NOT_REQUIRED (-2)
2884 #define SLOT_REQUIRED     (-1)
2885
2886   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2887   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2888
2889   /* First mark all the registers that really need to be saved...  */
2890   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2891     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2892
2893   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2894     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2895
2896   /* ... that includes the eh data registers (if needed)...  */
2897   if (crtl->calls_eh_return)
2898     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2899       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2900         = SLOT_REQUIRED;
2901
2902   /* ... and any callee saved register that dataflow says is live.  */
2903   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2904     if (df_regs_ever_live_p (regno)
2905         && (regno == R30_REGNUM
2906             || !call_used_regs[regno]))
2907       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2908
2909   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2910     if (df_regs_ever_live_p (regno)
2911         && !call_used_regs[regno])
2912       {
2913         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2914         last_fp_reg = regno;
2915       }
2916
2917   if (frame_pointer_needed)
2918     {
2919       /* FP and LR are placed in the linkage record.  */
2920       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2921       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2922       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2923       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2924       offset += 2 * UNITS_PER_WORD;
2925     }
2926
2927   /* Now assign stack slots for them.  */
2928   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2929     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2930       {
2931         cfun->machine->frame.reg_offset[regno] = offset;
2932         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2933           cfun->machine->frame.wb_candidate1 = regno;
2934         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2935           cfun->machine->frame.wb_candidate2 = regno;
2936         offset += UNITS_PER_WORD;
2937       }
2938
2939   HOST_WIDE_INT max_int_offset = offset;
2940   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2941   bool has_align_gap = offset != max_int_offset;
2942
2943   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2944     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2945       {
2946         /* If there is an alignment gap between integer and fp callee-saves,
2947            allocate the last fp register to it if possible.  */
2948         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2949           {
2950             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2951             break;
2952           }
2953
2954         cfun->machine->frame.reg_offset[regno] = offset;
2955         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2956           cfun->machine->frame.wb_candidate1 = regno;
2957         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2958                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2959           cfun->machine->frame.wb_candidate2 = regno;
2960         offset += UNITS_PER_WORD;
2961       }
2962
2963   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2964
2965   cfun->machine->frame.saved_regs_size = offset;
2966
2967   HOST_WIDE_INT varargs_and_saved_regs_size
2968     = offset + cfun->machine->frame.saved_varargs_size;
2969
2970   cfun->machine->frame.hard_fp_offset
2971     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2972                 STACK_BOUNDARY / BITS_PER_UNIT);
2973
2974   cfun->machine->frame.frame_size
2975     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2976                 + crtl->outgoing_args_size,
2977                 STACK_BOUNDARY / BITS_PER_UNIT);
2978
2979   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2980
2981   cfun->machine->frame.initial_adjust = 0;
2982   cfun->machine->frame.final_adjust = 0;
2983   cfun->machine->frame.callee_adjust = 0;
2984   cfun->machine->frame.callee_offset = 0;
2985
2986   HOST_WIDE_INT max_push_offset = 0;
2987   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2988     max_push_offset = 512;
2989   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2990     max_push_offset = 256;
2991
2992   if (cfun->machine->frame.frame_size < max_push_offset
2993       && crtl->outgoing_args_size == 0)
2994     {
2995       /* Simple, small frame with no outgoing arguments:
2996          stp reg1, reg2, [sp, -frame_size]!
2997          stp reg3, reg4, [sp, 16]  */
2998       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2999     }
3000   else if ((crtl->outgoing_args_size
3001             + cfun->machine->frame.saved_regs_size < 512)
3002            && !(cfun->calls_alloca
3003                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3004     {
3005       /* Frame with small outgoing arguments:
3006          sub sp, sp, frame_size
3007          stp reg1, reg2, [sp, outgoing_args_size]
3008          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
3009       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3010       cfun->machine->frame.callee_offset
3011         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3012     }
3013   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3014     {
3015       /* Frame with large outgoing arguments but a small local area:
3016          stp reg1, reg2, [sp, -hard_fp_offset]!
3017          stp reg3, reg4, [sp, 16]
3018          sub sp, sp, outgoing_args_size  */
3019       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3020       cfun->machine->frame.final_adjust
3021         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3022     }
3023   else if (!frame_pointer_needed
3024            && varargs_and_saved_regs_size < max_push_offset)
3025     {
3026       /* Frame with large local area and outgoing arguments (this pushes the
3027          callee-saves first, followed by the locals and outgoing area):
3028          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3029          stp reg3, reg4, [sp, 16]
3030          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
3031       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3032       cfun->machine->frame.final_adjust
3033         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3034       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3035       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3036     }
3037   else
3038     {
3039       /* Frame with large local area and outgoing arguments using frame pointer:
3040          sub sp, sp, hard_fp_offset
3041          stp x29, x30, [sp, 0]
3042          add x29, sp, 0
3043          stp reg3, reg4, [sp, 16]
3044          sub sp, sp, outgoing_args_size  */
3045       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3046       cfun->machine->frame.final_adjust
3047         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3048     }
3049
3050   cfun->machine->frame.laid_out = true;
3051 }
3052
3053 /* Return true if the register REGNO is saved on entry to
3054    the current function.  */
3055
3056 static bool
3057 aarch64_register_saved_on_entry (int regno)
3058 {
3059   return cfun->machine->frame.reg_offset[regno] >= 0;
3060 }
3061
3062 /* Return the next register up from REGNO up to LIMIT for the callee
3063    to save.  */
3064
3065 static unsigned
3066 aarch64_next_callee_save (unsigned regno, unsigned limit)
3067 {
3068   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3069     regno ++;
3070   return regno;
3071 }
3072
3073 /* Push the register number REGNO of mode MODE to the stack with write-back
3074    adjusting the stack by ADJUSTMENT.  */
3075
3076 static void
3077 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3078                            HOST_WIDE_INT adjustment)
3079  {
3080   rtx base_rtx = stack_pointer_rtx;
3081   rtx insn, reg, mem;
3082
3083   reg = gen_rtx_REG (mode, regno);
3084   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3085                             plus_constant (Pmode, base_rtx, -adjustment));
3086   mem = gen_frame_mem (mode, mem);
3087
3088   insn = emit_move_insn (mem, reg);
3089   RTX_FRAME_RELATED_P (insn) = 1;
3090 }
3091
3092 /* Generate and return an instruction to store the pair of registers
3093    REG and REG2 of mode MODE to location BASE with write-back adjusting
3094    the stack location BASE by ADJUSTMENT.  */
3095
3096 static rtx
3097 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3098                           HOST_WIDE_INT adjustment)
3099 {
3100   switch (mode)
3101     {
3102     case E_DImode:
3103       return gen_storewb_pairdi_di (base, base, reg, reg2,
3104                                     GEN_INT (-adjustment),
3105                                     GEN_INT (UNITS_PER_WORD - adjustment));
3106     case E_DFmode:
3107       return gen_storewb_pairdf_di (base, base, reg, reg2,
3108                                     GEN_INT (-adjustment),
3109                                     GEN_INT (UNITS_PER_WORD - adjustment));
3110     default:
3111       gcc_unreachable ();
3112     }
3113 }
3114
3115 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3116    stack pointer by ADJUSTMENT.  */
3117
3118 static void
3119 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3120 {
3121   rtx_insn *insn;
3122   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3123
3124   if (regno2 == INVALID_REGNUM)
3125     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3126
3127   rtx reg1 = gen_rtx_REG (mode, regno1);
3128   rtx reg2 = gen_rtx_REG (mode, regno2);
3129
3130   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3131                                               reg2, adjustment));
3132   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3133   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3134   RTX_FRAME_RELATED_P (insn) = 1;
3135 }
3136
3137 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3138    adjusting it by ADJUSTMENT afterwards.  */
3139
3140 static rtx
3141 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3142                          HOST_WIDE_INT adjustment)
3143 {
3144   switch (mode)
3145     {
3146     case E_DImode:
3147       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3148                                    GEN_INT (UNITS_PER_WORD));
3149     case E_DFmode:
3150       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3151                                    GEN_INT (UNITS_PER_WORD));
3152     default:
3153       gcc_unreachable ();
3154     }
3155 }
3156
3157 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3158    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3159    into CFI_OPS.  */
3160
3161 static void
3162 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3163                   rtx *cfi_ops)
3164 {
3165   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3166   rtx reg1 = gen_rtx_REG (mode, regno1);
3167
3168   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3169
3170   if (regno2 == INVALID_REGNUM)
3171     {
3172       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3173       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3174       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3175     }
3176   else
3177     {
3178       rtx reg2 = gen_rtx_REG (mode, regno2);
3179       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3180       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3181                                           reg2, adjustment));
3182     }
3183 }
3184
3185 /* Generate and return a store pair instruction of mode MODE to store
3186    register REG1 to MEM1 and register REG2 to MEM2.  */
3187
3188 static rtx
3189 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3190                         rtx reg2)
3191 {
3192   switch (mode)
3193     {
3194     case E_DImode:
3195       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3196
3197     case E_DFmode:
3198       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3199
3200     default:
3201       gcc_unreachable ();
3202     }
3203 }
3204
3205 /* Generate and regurn a load pair isntruction of mode MODE to load register
3206    REG1 from MEM1 and register REG2 from MEM2.  */
3207
3208 static rtx
3209 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3210                        rtx mem2)
3211 {
3212   switch (mode)
3213     {
3214     case E_DImode:
3215       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3216
3217     case E_DFmode:
3218       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3219
3220     default:
3221       gcc_unreachable ();
3222     }
3223 }
3224
3225 /* Return TRUE if return address signing should be enabled for the current
3226    function, otherwise return FALSE.  */
3227
3228 bool
3229 aarch64_return_address_signing_enabled (void)
3230 {
3231   /* This function should only be called after frame laid out.   */
3232   gcc_assert (cfun->machine->frame.laid_out);
3233
3234   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3235      if it's LR is pushed onto stack.  */
3236   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3237           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3238               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3239 }
3240
3241 /* Emit code to save the callee-saved registers from register number START
3242    to LIMIT to the stack at the location starting at offset START_OFFSET,
3243    skipping any write-back candidates if SKIP_WB is true.  */
3244
3245 static void
3246 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3247                            unsigned start, unsigned limit, bool skip_wb)
3248 {
3249   rtx_insn *insn;
3250   unsigned regno;
3251   unsigned regno2;
3252
3253   for (regno = aarch64_next_callee_save (start, limit);
3254        regno <= limit;
3255        regno = aarch64_next_callee_save (regno + 1, limit))
3256     {
3257       rtx reg, mem;
3258       HOST_WIDE_INT offset;
3259
3260       if (skip_wb
3261           && (regno == cfun->machine->frame.wb_candidate1
3262               || regno == cfun->machine->frame.wb_candidate2))
3263         continue;
3264
3265       if (cfun->machine->reg_is_wrapped_separately[regno])
3266        continue;
3267
3268       reg = gen_rtx_REG (mode, regno);
3269       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3270       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3271                                                 offset));
3272
3273       regno2 = aarch64_next_callee_save (regno + 1, limit);
3274
3275       if (regno2 <= limit
3276           && !cfun->machine->reg_is_wrapped_separately[regno2]
3277           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3278               == cfun->machine->frame.reg_offset[regno2]))
3279
3280         {
3281           rtx reg2 = gen_rtx_REG (mode, regno2);
3282           rtx mem2;
3283
3284           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3285           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3286                                                      offset));
3287           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3288                                                     reg2));
3289
3290           /* The first part of a frame-related parallel insn is
3291              always assumed to be relevant to the frame
3292              calculations; subsequent parts, are only
3293              frame-related if explicitly marked.  */
3294           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3295           regno = regno2;
3296         }
3297       else
3298         insn = emit_move_insn (mem, reg);
3299
3300       RTX_FRAME_RELATED_P (insn) = 1;
3301     }
3302 }
3303
3304 /* Emit code to restore the callee registers of mode MODE from register
3305    number START up to and including LIMIT.  Restore from the stack offset
3306    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3307    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3308
3309 static void
3310 aarch64_restore_callee_saves (machine_mode mode,
3311                               HOST_WIDE_INT start_offset, unsigned start,
3312                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3313 {
3314   rtx base_rtx = stack_pointer_rtx;
3315   unsigned regno;
3316   unsigned regno2;
3317   HOST_WIDE_INT offset;
3318
3319   for (regno = aarch64_next_callee_save (start, limit);
3320        regno <= limit;
3321        regno = aarch64_next_callee_save (regno + 1, limit))
3322     {
3323       if (cfun->machine->reg_is_wrapped_separately[regno])
3324        continue;
3325
3326       rtx reg, mem;
3327
3328       if (skip_wb
3329           && (regno == cfun->machine->frame.wb_candidate1
3330               || regno == cfun->machine->frame.wb_candidate2))
3331         continue;
3332
3333       reg = gen_rtx_REG (mode, regno);
3334       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3335       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336
3337       regno2 = aarch64_next_callee_save (regno + 1, limit);
3338
3339       if (regno2 <= limit
3340           && !cfun->machine->reg_is_wrapped_separately[regno2]
3341           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3342               == cfun->machine->frame.reg_offset[regno2]))
3343         {
3344           rtx reg2 = gen_rtx_REG (mode, regno2);
3345           rtx mem2;
3346
3347           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3348           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3349           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3350
3351           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3352           regno = regno2;
3353         }
3354       else
3355         emit_move_insn (reg, mem);
3356       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3357     }
3358 }
3359
3360 static inline bool
3361 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3362                                HOST_WIDE_INT offset)
3363 {
3364   return offset >= -256 && offset < 256;
3365 }
3366
3367 static inline bool
3368 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3369 {
3370   return (offset >= 0
3371           && offset < 4096 * GET_MODE_SIZE (mode)
3372           && offset % GET_MODE_SIZE (mode) == 0);
3373 }
3374
3375 bool
3376 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3377 {
3378   return (offset >= -64 * GET_MODE_SIZE (mode)
3379           && offset < 64 * GET_MODE_SIZE (mode)
3380           && offset % GET_MODE_SIZE (mode) == 0);
3381 }
3382
3383 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3384
3385 static sbitmap
3386 aarch64_get_separate_components (void)
3387 {
3388   aarch64_layout_frame ();
3389
3390   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3391   bitmap_clear (components);
3392
3393   /* The registers we need saved to the frame.  */
3394   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3395     if (aarch64_register_saved_on_entry (regno))
3396       {
3397         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3398         if (!frame_pointer_needed)
3399           offset += cfun->machine->frame.frame_size
3400                     - cfun->machine->frame.hard_fp_offset;
3401         /* Check that we can access the stack slot of the register with one
3402            direct load with no adjustments needed.  */
3403         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3404           bitmap_set_bit (components, regno);
3405       }
3406
3407   /* Don't mess with the hard frame pointer.  */
3408   if (frame_pointer_needed)
3409     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3410
3411   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3412   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3413   /* If aarch64_layout_frame has chosen registers to store/restore with
3414      writeback don't interfere with them to avoid having to output explicit
3415      stack adjustment instructions.  */
3416   if (reg2 != INVALID_REGNUM)
3417     bitmap_clear_bit (components, reg2);
3418   if (reg1 != INVALID_REGNUM)
3419     bitmap_clear_bit (components, reg1);
3420
3421   bitmap_clear_bit (components, LR_REGNUM);
3422   bitmap_clear_bit (components, SP_REGNUM);
3423
3424   return components;
3425 }
3426
3427 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3428
3429 static sbitmap
3430 aarch64_components_for_bb (basic_block bb)
3431 {
3432   bitmap in = DF_LIVE_IN (bb);
3433   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3434   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3435
3436   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3437   bitmap_clear (components);
3438
3439   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3440   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3441     if ((!call_used_regs[regno])
3442        && (bitmap_bit_p (in, regno)
3443            || bitmap_bit_p (gen, regno)
3444            || bitmap_bit_p (kill, regno)))
3445           bitmap_set_bit (components, regno);
3446
3447   return components;
3448 }
3449
3450 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3451    Nothing to do for aarch64.  */
3452
3453 static void
3454 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3455 {
3456 }
3457
3458 /* Return the next set bit in BMP from START onwards.  Return the total number
3459    of bits in BMP if no set bit is found at or after START.  */
3460
3461 static unsigned int
3462 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3463 {
3464   unsigned int nbits = SBITMAP_SIZE (bmp);
3465   if (start == nbits)
3466     return start;
3467
3468   gcc_assert (start < nbits);
3469   for (unsigned int i = start; i < nbits; i++)
3470     if (bitmap_bit_p (bmp, i))
3471       return i;
3472
3473   return nbits;
3474 }
3475
3476 /* Do the work for aarch64_emit_prologue_components and
3477    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3478    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3479    for these components or the epilogue sequence.  That is, it determines
3480    whether we should emit stores or loads and what kind of CFA notes to attach
3481    to the insns.  Otherwise the logic for the two sequences is very
3482    similar.  */
3483
3484 static void
3485 aarch64_process_components (sbitmap components, bool prologue_p)
3486 {
3487   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3488                              ? HARD_FRAME_POINTER_REGNUM
3489                              : STACK_POINTER_REGNUM);
3490
3491   unsigned last_regno = SBITMAP_SIZE (components);
3492   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3493   rtx_insn *insn = NULL;
3494
3495   while (regno != last_regno)
3496     {
3497       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3498          so DFmode for the vector registers is enough.  */
3499       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3500       rtx reg = gen_rtx_REG (mode, regno);
3501       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3502       if (!frame_pointer_needed)
3503         offset += cfun->machine->frame.frame_size
3504                   - cfun->machine->frame.hard_fp_offset;
3505       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3506       rtx mem = gen_frame_mem (mode, addr);
3507
3508       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3509       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3510       /* No more registers to handle after REGNO.
3511          Emit a single save/restore and exit.  */
3512       if (regno2 == last_regno)
3513         {
3514           insn = emit_insn (set);
3515           RTX_FRAME_RELATED_P (insn) = 1;
3516           if (prologue_p)
3517             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3518           else
3519             add_reg_note (insn, REG_CFA_RESTORE, reg);
3520           break;
3521         }
3522
3523       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3524       /* The next register is not of the same class or its offset is not
3525          mergeable with the current one into a pair.  */
3526       if (!satisfies_constraint_Ump (mem)
3527           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3528           || (offset2 - cfun->machine->frame.reg_offset[regno])
3529                 != GET_MODE_SIZE (mode))
3530         {
3531           insn = emit_insn (set);
3532           RTX_FRAME_RELATED_P (insn) = 1;
3533           if (prologue_p)
3534             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3535           else
3536             add_reg_note (insn, REG_CFA_RESTORE, reg);
3537
3538           regno = regno2;
3539           continue;
3540         }
3541
3542       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3543       rtx reg2 = gen_rtx_REG (mode, regno2);
3544       if (!frame_pointer_needed)
3545         offset2 += cfun->machine->frame.frame_size
3546                   - cfun->machine->frame.hard_fp_offset;
3547       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3548       rtx mem2 = gen_frame_mem (mode, addr2);
3549       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3550                              : gen_rtx_SET (reg2, mem2);
3551
3552       if (prologue_p)
3553         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3554       else
3555         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3556
3557       RTX_FRAME_RELATED_P (insn) = 1;
3558       if (prologue_p)
3559         {
3560           add_reg_note (insn, REG_CFA_OFFSET, set);
3561           add_reg_note (insn, REG_CFA_OFFSET, set2);
3562         }
3563       else
3564         {
3565           add_reg_note (insn, REG_CFA_RESTORE, reg);
3566           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3567         }
3568
3569       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3570     }
3571 }
3572
3573 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3574
3575 static void
3576 aarch64_emit_prologue_components (sbitmap components)
3577 {
3578   aarch64_process_components (components, true);
3579 }
3580
3581 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3582
3583 static void
3584 aarch64_emit_epilogue_components (sbitmap components)
3585 {
3586   aarch64_process_components (components, false);
3587 }
3588
3589 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3590
3591 static void
3592 aarch64_set_handled_components (sbitmap components)
3593 {
3594   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3595     if (bitmap_bit_p (components, regno))
3596       cfun->machine->reg_is_wrapped_separately[regno] = true;
3597 }
3598
3599 /* AArch64 stack frames generated by this compiler look like:
3600
3601         +-------------------------------+
3602         |                               |
3603         |  incoming stack arguments     |
3604         |                               |
3605         +-------------------------------+
3606         |                               | <-- incoming stack pointer (aligned)
3607         |  callee-allocated save area   |
3608         |  for register varargs         |
3609         |                               |
3610         +-------------------------------+
3611         |  local variables              | <-- frame_pointer_rtx
3612         |                               |
3613         +-------------------------------+
3614         |  padding0                     | \
3615         +-------------------------------+  |
3616         |  callee-saved registers       |  | frame.saved_regs_size
3617         +-------------------------------+  |
3618         |  LR'                          |  |
3619         +-------------------------------+  |
3620         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3621         +-------------------------------+
3622         |  dynamic allocation           |
3623         +-------------------------------+
3624         |  padding                      |
3625         +-------------------------------+
3626         |  outgoing stack arguments     | <-- arg_pointer
3627         |                               |
3628         +-------------------------------+
3629         |                               | <-- stack_pointer_rtx (aligned)
3630
3631    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3632    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3633    unchanged.  */
3634
3635 /* Generate the prologue instructions for entry into a function.
3636    Establish the stack frame by decreasing the stack pointer with a
3637    properly calculated size and, if necessary, create a frame record
3638    filled with the values of LR and previous frame pointer.  The
3639    current FP is also set up if it is in use.  */
3640
3641 void
3642 aarch64_expand_prologue (void)
3643 {
3644   aarch64_layout_frame ();
3645
3646   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3647   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3648   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3649   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3650   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3651   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3652   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3653   rtx_insn *insn;
3654
3655   /* Sign return address for functions.  */
3656   if (aarch64_return_address_signing_enabled ())
3657     {
3658       insn = emit_insn (gen_pacisp ());
3659       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3660       RTX_FRAME_RELATED_P (insn) = 1;
3661     }
3662
3663   if (flag_stack_usage_info)
3664     current_function_static_stack_size = frame_size;
3665
3666   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3667     {
3668       if (crtl->is_leaf && !cfun->calls_alloca)
3669         {
3670           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3671             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3672                                             frame_size - STACK_CHECK_PROTECT);
3673         }
3674       else if (frame_size > 0)
3675         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3676     }
3677
3678   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3679
3680   if (callee_adjust != 0)
3681     aarch64_push_regs (reg1, reg2, callee_adjust);
3682
3683   if (frame_pointer_needed)
3684     {
3685       if (callee_adjust == 0)
3686         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3687                                    R30_REGNUM, false);
3688       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3689                                        stack_pointer_rtx,
3690                                        GEN_INT (callee_offset)));
3691       RTX_FRAME_RELATED_P (insn) = 1;
3692       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3693     }
3694
3695   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3696                              callee_adjust != 0 || frame_pointer_needed);
3697   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3698                              callee_adjust != 0 || frame_pointer_needed);
3699   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3700 }
3701
3702 /* Return TRUE if we can use a simple_return insn.
3703
3704    This function checks whether the callee saved stack is empty, which
3705    means no restore actions are need. The pro_and_epilogue will use
3706    this to check whether shrink-wrapping opt is feasible.  */
3707
3708 bool
3709 aarch64_use_return_insn_p (void)
3710 {
3711   if (!reload_completed)
3712     return false;
3713
3714   if (crtl->profile)
3715     return false;
3716
3717   aarch64_layout_frame ();
3718
3719   return cfun->machine->frame.frame_size == 0;
3720 }
3721
3722 /* Generate the epilogue instructions for returning from a function.
3723    This is almost exactly the reverse of the prolog sequence, except
3724    that we need to insert barriers to avoid scheduling loads that read
3725    from a deallocated stack, and we optimize the unwind records by
3726    emitting them all together if possible.  */
3727 void
3728 aarch64_expand_epilogue (bool for_sibcall)
3729 {
3730   aarch64_layout_frame ();
3731
3732   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3733   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3734   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3735   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3736   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3737   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3738   rtx cfi_ops = NULL;
3739   rtx_insn *insn;
3740
3741   /* We need to add memory barrier to prevent read from deallocated stack.  */
3742   bool need_barrier_p = (get_frame_size ()
3743                          + cfun->machine->frame.saved_varargs_size) != 0;
3744
3745   /* Emit a barrier to prevent loads from a deallocated stack.  */
3746   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3747       || crtl->calls_eh_return)
3748     {
3749       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3750       need_barrier_p = false;
3751     }
3752
3753   /* Restore the stack pointer from the frame pointer if it may not
3754      be the same as the stack pointer.  */
3755   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3756     {
3757       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3758                                        hard_frame_pointer_rtx,
3759                                        GEN_INT (-callee_offset)));
3760       /* If writeback is used when restoring callee-saves, the CFA
3761          is restored on the instruction doing the writeback.  */
3762       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3763     }
3764   else
3765     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3766
3767   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3768                                 callee_adjust != 0, &cfi_ops);
3769   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3770                                 callee_adjust != 0, &cfi_ops);
3771
3772   if (need_barrier_p)
3773     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3774
3775   if (callee_adjust != 0)
3776     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3777
3778   if (callee_adjust != 0 || initial_adjust > 65536)
3779     {
3780       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3781       insn = get_last_insn ();
3782       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3783       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3784       RTX_FRAME_RELATED_P (insn) = 1;
3785       cfi_ops = NULL;
3786     }
3787
3788   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3789
3790   if (cfi_ops)
3791     {
3792       /* Emit delayed restores and reset the CFA to be SP.  */
3793       insn = get_last_insn ();
3794       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3795       REG_NOTES (insn) = cfi_ops;
3796       RTX_FRAME_RELATED_P (insn) = 1;
3797     }
3798
3799   /* We prefer to emit the combined return/authenticate instruction RETAA,
3800      however there are three cases in which we must instead emit an explicit
3801      authentication instruction.
3802
3803         1) Sibcalls don't return in a normal way, so if we're about to call one
3804            we must authenticate.
3805
3806         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3807            generating code for !TARGET_ARMV8_3 we can't use it and must
3808            explicitly authenticate.
3809
3810         3) On an eh_return path we make extra stack adjustments to update the
3811            canonical frame address to be the exception handler's CFA.  We want
3812            to authenticate using the CFA of the function which calls eh_return.
3813     */
3814   if (aarch64_return_address_signing_enabled ()
3815       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3816     {
3817       insn = emit_insn (gen_autisp ());
3818       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3819       RTX_FRAME_RELATED_P (insn) = 1;
3820     }
3821
3822   /* Stack adjustment for exception handler.  */
3823   if (crtl->calls_eh_return)
3824     {
3825       /* We need to unwind the stack by the offset computed by
3826          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3827          to be SP; letting the CFA move during this adjustment
3828          is just as correct as retaining the CFA from the body
3829          of the function.  Therefore, do nothing special.  */
3830       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3831     }
3832
3833   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3834   if (!for_sibcall)
3835     emit_jump_insn (ret_rtx);
3836 }
3837
3838 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3839    normally or return to a previous frame after unwinding.
3840
3841    An EH return uses a single shared return sequence.  The epilogue is
3842    exactly like a normal epilogue except that it has an extra input
3843    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3844    that must be applied after the frame has been destroyed.  An extra label
3845    is inserted before the epilogue which initializes this register to zero,
3846    and this is the entry point for a normal return.
3847
3848    An actual EH return updates the return address, initializes the stack
3849    adjustment and jumps directly into the epilogue (bypassing the zeroing
3850    of the adjustment).  Since the return address is typically saved on the
3851    stack when a function makes a call, the saved LR must be updated outside
3852    the epilogue.
3853
3854    This poses problems as the store is generated well before the epilogue,
3855    so the offset of LR is not known yet.  Also optimizations will remove the
3856    store as it appears dead, even after the epilogue is generated (as the
3857    base or offset for loading LR is different in many cases).
3858
3859    To avoid these problems this implementation forces the frame pointer
3860    in eh_return functions so that the location of LR is fixed and known early.
3861    It also marks the store volatile, so no optimization is permitted to
3862    remove the store.  */
3863 rtx
3864 aarch64_eh_return_handler_rtx (void)
3865 {
3866   rtx tmp = gen_frame_mem (Pmode,
3867     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3868
3869   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3870   MEM_VOLATILE_P (tmp) = true;
3871   return tmp;
3872 }
3873
3874 /* Output code to add DELTA to the first argument, and then jump
3875    to FUNCTION.  Used for C++ multiple inheritance.  */
3876 static void
3877 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3878                          HOST_WIDE_INT delta,
3879                          HOST_WIDE_INT vcall_offset,
3880                          tree function)
3881 {
3882   /* The this pointer is always in x0.  Note that this differs from
3883      Arm where the this pointer maybe bumped to r1 if r0 is required
3884      to return a pointer to an aggregate.  On AArch64 a result value
3885      pointer will be in x8.  */
3886   int this_regno = R0_REGNUM;
3887   rtx this_rtx, temp0, temp1, addr, funexp;
3888   rtx_insn *insn;
3889
3890   reload_completed = 1;
3891   emit_note (NOTE_INSN_PROLOGUE_END);
3892
3893   if (vcall_offset == 0)
3894     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3895   else
3896     {
3897       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3898
3899       this_rtx = gen_rtx_REG (Pmode, this_regno);
3900       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3901       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3902
3903       addr = this_rtx;
3904       if (delta != 0)
3905         {
3906           if (delta >= -256 && delta < 256)
3907             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3908                                        plus_constant (Pmode, this_rtx, delta));
3909           else
3910             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3911         }
3912
3913       if (Pmode == ptr_mode)
3914         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3915       else
3916         aarch64_emit_move (temp0,
3917                            gen_rtx_ZERO_EXTEND (Pmode,
3918                                                 gen_rtx_MEM (ptr_mode, addr)));
3919
3920       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3921           addr = plus_constant (Pmode, temp0, vcall_offset);
3922       else
3923         {
3924           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3925                                           Pmode);
3926           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3927         }
3928
3929       if (Pmode == ptr_mode)
3930         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3931       else
3932         aarch64_emit_move (temp1,
3933                            gen_rtx_SIGN_EXTEND (Pmode,
3934                                                 gen_rtx_MEM (ptr_mode, addr)));
3935
3936       emit_insn (gen_add2_insn (this_rtx, temp1));
3937     }
3938
3939   /* Generate a tail call to the target function.  */
3940   if (!TREE_USED (function))
3941     {
3942       assemble_external (function);
3943       TREE_USED (function) = 1;
3944     }
3945   funexp = XEXP (DECL_RTL (function), 0);
3946   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3947   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3948   SIBLING_CALL_P (insn) = 1;
3949
3950   insn = get_insns ();
3951   shorten_branches (insn);
3952   final_start_function (insn, file, 1);
3953   final (insn, file, 1);
3954   final_end_function ();
3955
3956   /* Stop pretending to be a post-reload pass.  */
3957   reload_completed = 0;
3958 }
3959
3960 static bool
3961 aarch64_tls_referenced_p (rtx x)
3962 {
3963   if (!TARGET_HAVE_TLS)
3964     return false;
3965   subrtx_iterator::array_type array;
3966   FOR_EACH_SUBRTX (iter, array, x, ALL)
3967     {
3968       const_rtx x = *iter;
3969       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3970         return true;
3971       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3972          TLS offsets, not real symbol references.  */
3973       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3974         iter.skip_subrtxes ();
3975     }
3976   return false;
3977 }
3978
3979
3980 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3981    a left shift of 0 or 12 bits.  */
3982 bool
3983 aarch64_uimm12_shift (HOST_WIDE_INT val)
3984 {
3985   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3986           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3987           );
3988 }
3989
3990
3991 /* Return true if val is an immediate that can be loaded into a
3992    register by a MOVZ instruction.  */
3993 static bool
3994 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
3995 {
3996   if (GET_MODE_SIZE (mode) > 4)
3997     {
3998       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3999           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4000         return 1;
4001     }
4002   else
4003     {
4004       /* Ignore sign extension.  */
4005       val &= (HOST_WIDE_INT) 0xffffffff;
4006     }
4007   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4008           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4009 }
4010
4011 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4012
4013 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4014   {
4015     0x0000000100000001ull,
4016     0x0001000100010001ull,
4017     0x0101010101010101ull,
4018     0x1111111111111111ull,
4019     0x5555555555555555ull,
4020   };
4021
4022
4023 /* Return true if val is a valid bitmask immediate.  */
4024
4025 bool
4026 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4027 {
4028   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4029   int bits;
4030
4031   /* Check for a single sequence of one bits and return quickly if so.
4032      The special cases of all ones and all zeroes returns false.  */
4033   val = (unsigned HOST_WIDE_INT) val_in;
4034   tmp = val + (val & -val);
4035
4036   if (tmp == (tmp & -tmp))
4037     return (val + 1) > 1;
4038
4039   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4040   if (mode == SImode)
4041     val = (val << 32) | (val & 0xffffffff);
4042
4043   /* Invert if the immediate doesn't start with a zero bit - this means we
4044      only need to search for sequences of one bits.  */
4045   if (val & 1)
4046     val = ~val;
4047
4048   /* Find the first set bit and set tmp to val with the first sequence of one
4049      bits removed.  Return success if there is a single sequence of ones.  */
4050   first_one = val & -val;
4051   tmp = val & (val + first_one);
4052
4053   if (tmp == 0)
4054     return true;
4055
4056   /* Find the next set bit and compute the difference in bit position.  */
4057   next_one = tmp & -tmp;
4058   bits = clz_hwi (first_one) - clz_hwi (next_one);
4059   mask = val ^ tmp;
4060
4061   /* Check the bit position difference is a power of 2, and that the first
4062      sequence of one bits fits within 'bits' bits.  */
4063   if ((mask >> bits) != 0 || bits != (bits & -bits))
4064     return false;
4065
4066   /* Check the sequence of one bits is repeated 64/bits times.  */
4067   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4068 }
4069
4070 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4071    Assumed precondition: VAL_IN Is not zero.  */
4072
4073 unsigned HOST_WIDE_INT
4074 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4075 {
4076   int lowest_bit_set = ctz_hwi (val_in);
4077   int highest_bit_set = floor_log2 (val_in);
4078   gcc_assert (val_in != 0);
4079
4080   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4081           (HOST_WIDE_INT_1U << lowest_bit_set));
4082 }
4083
4084 /* Create constant where bits outside of lowest bit set to highest bit set
4085    are set to 1.  */
4086
4087 unsigned HOST_WIDE_INT
4088 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4089 {
4090   return val_in | ~aarch64_and_split_imm1 (val_in);
4091 }
4092
4093 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4094
4095 bool
4096 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4097 {
4098   scalar_int_mode int_mode;
4099   if (!is_a <scalar_int_mode> (mode, &int_mode))
4100     return false;
4101
4102   if (aarch64_bitmask_imm (val_in, int_mode))
4103     return false;
4104
4105   if (aarch64_move_imm (val_in, int_mode))
4106     return false;
4107
4108   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4109
4110   return aarch64_bitmask_imm (imm2, int_mode);
4111 }
4112
4113 /* Return true if val is an immediate that can be loaded into a
4114    register in a single instruction.  */
4115 bool
4116 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4117 {
4118   scalar_int_mode int_mode;
4119   if (!is_a <scalar_int_mode> (mode, &int_mode))
4120     return false;
4121
4122   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4123     return 1;
4124   return aarch64_bitmask_imm (val, int_mode);
4125 }
4126
4127 static bool
4128 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4129 {
4130   rtx base, offset;
4131
4132   if (GET_CODE (x) == HIGH)
4133     return true;
4134
4135   split_const (x, &base, &offset);
4136   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4137     {
4138       if (aarch64_classify_symbol (base, offset)
4139           != SYMBOL_FORCE_TO_MEM)
4140         return true;
4141       else
4142         /* Avoid generating a 64-bit relocation in ILP32; leave
4143            to aarch64_expand_mov_immediate to handle it properly.  */
4144         return mode != ptr_mode;
4145     }
4146
4147   return aarch64_tls_referenced_p (x);
4148 }
4149
4150 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4151    The expansion for a table switch is quite expensive due to the number
4152    of instructions, the table lookup and hard to predict indirect jump.
4153    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4154    set, otherwise use tables for > 16 cases as a tradeoff between size and
4155    performance.  When optimizing for size, use the default setting.  */
4156
4157 static unsigned int
4158 aarch64_case_values_threshold (void)
4159 {
4160   /* Use the specified limit for the number of cases before using jump
4161      tables at higher optimization levels.  */
4162   if (optimize > 2
4163       && selected_cpu->tune->max_case_values != 0)
4164     return selected_cpu->tune->max_case_values;
4165   else
4166     return optimize_size ? default_case_values_threshold () : 17;
4167 }
4168
4169 /* Return true if register REGNO is a valid index register.
4170    STRICT_P is true if REG_OK_STRICT is in effect.  */
4171
4172 bool
4173 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4174 {
4175   if (!HARD_REGISTER_NUM_P (regno))
4176     {
4177       if (!strict_p)
4178         return true;
4179
4180       if (!reg_renumber)
4181         return false;
4182
4183       regno = reg_renumber[regno];
4184     }
4185   return GP_REGNUM_P (regno);
4186 }
4187
4188 /* Return true if register REGNO is a valid base register for mode MODE.
4189    STRICT_P is true if REG_OK_STRICT is in effect.  */
4190
4191 bool
4192 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4193 {
4194   if (!HARD_REGISTER_NUM_P (regno))
4195     {
4196       if (!strict_p)
4197         return true;
4198
4199       if (!reg_renumber)
4200         return false;
4201
4202       regno = reg_renumber[regno];
4203     }
4204
4205   /* The fake registers will be eliminated to either the stack or
4206      hard frame pointer, both of which are usually valid base registers.
4207      Reload deals with the cases where the eliminated form isn't valid.  */
4208   return (GP_REGNUM_P (regno)
4209           || regno == SP_REGNUM
4210           || regno == FRAME_POINTER_REGNUM
4211           || regno == ARG_POINTER_REGNUM);
4212 }
4213
4214 /* Return true if X is a valid base register for mode MODE.
4215    STRICT_P is true if REG_OK_STRICT is in effect.  */
4216
4217 static bool
4218 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4219 {
4220   if (!strict_p
4221       && GET_CODE (x) == SUBREG
4222       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4223     x = SUBREG_REG (x);
4224
4225   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4226 }
4227
4228 /* Return true if address offset is a valid index.  If it is, fill in INFO
4229    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4230
4231 static bool
4232 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4233                         machine_mode mode, bool strict_p)
4234 {
4235   enum aarch64_address_type type;
4236   rtx index;
4237   int shift;
4238
4239   /* (reg:P) */
4240   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4241       && GET_MODE (x) == Pmode)
4242     {
4243       type = ADDRESS_REG_REG;
4244       index = x;
4245       shift = 0;
4246     }
4247   /* (sign_extend:DI (reg:SI)) */
4248   else if ((GET_CODE (x) == SIGN_EXTEND
4249             || GET_CODE (x) == ZERO_EXTEND)
4250            && GET_MODE (x) == DImode
4251            && GET_MODE (XEXP (x, 0)) == SImode)
4252     {
4253       type = (GET_CODE (x) == SIGN_EXTEND)
4254         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4255       index = XEXP (x, 0);
4256       shift = 0;
4257     }
4258   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4259   else if (GET_CODE (x) == MULT
4260            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4261                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4262            && GET_MODE (XEXP (x, 0)) == DImode
4263            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4264            && CONST_INT_P (XEXP (x, 1)))
4265     {
4266       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4267         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4268       index = XEXP (XEXP (x, 0), 0);
4269       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4270     }
4271   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4272   else if (GET_CODE (x) == ASHIFT
4273            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4274                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4275            && GET_MODE (XEXP (x, 0)) == DImode
4276            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4277            && CONST_INT_P (XEXP (x, 1)))
4278     {
4279       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4280         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4281       index = XEXP (XEXP (x, 0), 0);
4282       shift = INTVAL (XEXP (x, 1));
4283     }
4284   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4285   else if ((GET_CODE (x) == SIGN_EXTRACT
4286             || GET_CODE (x) == ZERO_EXTRACT)
4287            && GET_MODE (x) == DImode
4288            && GET_CODE (XEXP (x, 0)) == MULT
4289            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4290            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4291     {
4292       type = (GET_CODE (x) == SIGN_EXTRACT)
4293         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4294       index = XEXP (XEXP (x, 0), 0);
4295       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4296       if (INTVAL (XEXP (x, 1)) != 32 + shift
4297           || INTVAL (XEXP (x, 2)) != 0)
4298         shift = -1;
4299     }
4300   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4301      (const_int 0xffffffff<<shift)) */
4302   else if (GET_CODE (x) == AND
4303            && GET_MODE (x) == DImode
4304            && GET_CODE (XEXP (x, 0)) == MULT
4305            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4306            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4307            && CONST_INT_P (XEXP (x, 1)))
4308     {
4309       type = ADDRESS_REG_UXTW;
4310       index = XEXP (XEXP (x, 0), 0);
4311       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4312       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4313         shift = -1;
4314     }
4315   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4316   else if ((GET_CODE (x) == SIGN_EXTRACT
4317             || GET_CODE (x) == ZERO_EXTRACT)
4318            && GET_MODE (x) == DImode
4319            && GET_CODE (XEXP (x, 0)) == ASHIFT
4320            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4321            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4322     {
4323       type = (GET_CODE (x) == SIGN_EXTRACT)
4324         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4325       index = XEXP (XEXP (x, 0), 0);
4326       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4327       if (INTVAL (XEXP (x, 1)) != 32 + shift
4328           || INTVAL (XEXP (x, 2)) != 0)
4329         shift = -1;
4330     }
4331   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4332      (const_int 0xffffffff<<shift)) */
4333   else if (GET_CODE (x) == AND
4334            && GET_MODE (x) == DImode
4335            && GET_CODE (XEXP (x, 0)) == ASHIFT
4336            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4337            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4338            && CONST_INT_P (XEXP (x, 1)))
4339     {
4340       type = ADDRESS_REG_UXTW;
4341       index = XEXP (XEXP (x, 0), 0);
4342       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4343       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4344         shift = -1;
4345     }
4346   /* (mult:P (reg:P) (const_int scale)) */
4347   else if (GET_CODE (x) == MULT
4348            && GET_MODE (x) == Pmode
4349            && GET_MODE (XEXP (x, 0)) == Pmode
4350            && CONST_INT_P (XEXP (x, 1)))
4351     {
4352       type = ADDRESS_REG_REG;
4353       index = XEXP (x, 0);
4354       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4355     }
4356   /* (ashift:P (reg:P) (const_int shift)) */
4357   else if (GET_CODE (x) == ASHIFT
4358            && GET_MODE (x) == Pmode
4359            && GET_MODE (XEXP (x, 0)) == Pmode
4360            && CONST_INT_P (XEXP (x, 1)))
4361     {
4362       type = ADDRESS_REG_REG;
4363       index = XEXP (x, 0);
4364       shift = INTVAL (XEXP (x, 1));
4365     }
4366   else
4367     return false;
4368
4369   if (!strict_p
4370       && GET_CODE (index) == SUBREG
4371       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4372     index = SUBREG_REG (index);
4373
4374   if ((shift == 0 ||
4375        (shift > 0 && shift <= 3
4376         && (1 << shift) == GET_MODE_SIZE (mode)))
4377       && REG_P (index)
4378       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4379     {
4380       info->type = type;
4381       info->offset = index;
4382       info->shift = shift;
4383       return true;
4384     }
4385
4386   return false;
4387 }
4388
4389 /* Return true if MODE is one of the modes for which we
4390    support LDP/STP operations.  */
4391
4392 static bool
4393 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4394 {
4395   return mode == SImode || mode == DImode
4396          || mode == SFmode || mode == DFmode
4397          || (aarch64_vector_mode_supported_p (mode)
4398              && GET_MODE_SIZE (mode) == 8);
4399 }
4400
4401 /* Return true if REGNO is a virtual pointer register, or an eliminable
4402    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4403    include stack_pointer or hard_frame_pointer.  */
4404 static bool
4405 virt_or_elim_regno_p (unsigned regno)
4406 {
4407   return ((regno >= FIRST_VIRTUAL_REGISTER
4408            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4409           || regno == FRAME_POINTER_REGNUM
4410           || regno == ARG_POINTER_REGNUM);
4411 }
4412
4413 /* Return true if X is a valid address for machine mode MODE.  If it is,
4414    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4415    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4416
4417 static bool
4418 aarch64_classify_address (struct aarch64_address_info *info,
4419                           rtx x, machine_mode mode,
4420                           RTX_CODE outer_code, bool strict_p)
4421 {
4422   enum rtx_code code = GET_CODE (x);
4423   rtx op0, op1;
4424
4425   /* On BE, we use load/store pair for all large int mode load/stores.
4426      TI/TFmode may also use a load/store pair.  */
4427   bool load_store_pair_p = (outer_code == PARALLEL
4428                             || mode == TImode
4429                             || mode == TFmode
4430                             || (BYTES_BIG_ENDIAN
4431                                 && aarch64_vect_struct_mode_p (mode)));
4432
4433   bool allow_reg_index_p =
4434     !load_store_pair_p
4435     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4436     && !aarch64_vect_struct_mode_p (mode);
4437
4438   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4439      REG addressing.  */
4440   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4441       && (code != POST_INC && code != REG))
4442     return false;
4443
4444   switch (code)
4445     {
4446     case REG:
4447     case SUBREG:
4448       info->type = ADDRESS_REG_IMM;
4449       info->base = x;
4450       info->offset = const0_rtx;
4451       return aarch64_base_register_rtx_p (x, strict_p);
4452
4453     case PLUS:
4454       op0 = XEXP (x, 0);
4455       op1 = XEXP (x, 1);
4456
4457       if (! strict_p
4458           && REG_P (op0)
4459           && virt_or_elim_regno_p (REGNO (op0))
4460           && CONST_INT_P (op1))
4461         {
4462           info->type = ADDRESS_REG_IMM;
4463           info->base = op0;
4464           info->offset = op1;
4465
4466           return true;
4467         }
4468
4469       if (GET_MODE_SIZE (mode) != 0
4470           && CONST_INT_P (op1)
4471           && aarch64_base_register_rtx_p (op0, strict_p))
4472         {
4473           HOST_WIDE_INT offset = INTVAL (op1);
4474
4475           info->type = ADDRESS_REG_IMM;
4476           info->base = op0;
4477           info->offset = op1;
4478
4479           /* TImode and TFmode values are allowed in both pairs of X
4480              registers and individual Q registers.  The available
4481              address modes are:
4482              X,X: 7-bit signed scaled offset
4483              Q:   9-bit signed offset
4484              We conservatively require an offset representable in either mode.
4485              When performing the check for pairs of X registers i.e.  LDP/STP
4486              pass down DImode since that is the natural size of the LDP/STP
4487              instruction memory accesses.  */
4488           if (mode == TImode || mode == TFmode)
4489             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4490                     && (offset_9bit_signed_unscaled_p (mode, offset)
4491                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4492
4493           /* A 7bit offset check because OImode will emit a ldp/stp
4494              instruction (only big endian will get here).
4495              For ldp/stp instructions, the offset is scaled for the size of a
4496              single element of the pair.  */
4497           if (mode == OImode)
4498             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4499
4500           /* Three 9/12 bit offsets checks because CImode will emit three
4501              ldr/str instructions (only big endian will get here).  */
4502           if (mode == CImode)
4503             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4504                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4505                         || offset_12bit_unsigned_scaled_p (V16QImode,
4506                                                            offset + 32)));
4507
4508           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4509              instructions (only big endian will get here).  */
4510           if (mode == XImode)
4511             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4512                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4513                                                             offset + 32));
4514
4515           if (load_store_pair_p)
4516             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4517                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4518           else
4519             return (offset_9bit_signed_unscaled_p (mode, offset)
4520                     || offset_12bit_unsigned_scaled_p (mode, offset));
4521         }
4522
4523       if (allow_reg_index_p)
4524         {
4525           /* Look for base + (scaled/extended) index register.  */
4526           if (aarch64_base_register_rtx_p (op0, strict_p)
4527               && aarch64_classify_index (info, op1, mode, strict_p))
4528             {
4529               info->base = op0;
4530               return true;
4531             }
4532           if (aarch64_base_register_rtx_p (op1, strict_p)
4533               && aarch64_classify_index (info, op0, mode, strict_p))
4534             {
4535               info->base = op1;
4536               return true;
4537             }
4538         }
4539
4540       return false;
4541
4542     case POST_INC:
4543     case POST_DEC:
4544     case PRE_INC:
4545     case PRE_DEC:
4546       info->type = ADDRESS_REG_WB;
4547       info->base = XEXP (x, 0);
4548       info->offset = NULL_RTX;
4549       return aarch64_base_register_rtx_p (info->base, strict_p);
4550
4551     case POST_MODIFY:
4552     case PRE_MODIFY:
4553       info->type = ADDRESS_REG_WB;
4554       info->base = XEXP (x, 0);
4555       if (GET_CODE (XEXP (x, 1)) == PLUS
4556           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4557           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4558           && aarch64_base_register_rtx_p (info->base, strict_p))
4559         {
4560           HOST_WIDE_INT offset;
4561           info->offset = XEXP (XEXP (x, 1), 1);
4562           offset = INTVAL (info->offset);
4563
4564           /* TImode and TFmode values are allowed in both pairs of X
4565              registers and individual Q registers.  The available
4566              address modes are:
4567              X,X: 7-bit signed scaled offset
4568              Q:   9-bit signed offset
4569              We conservatively require an offset representable in either mode.
4570            */
4571           if (mode == TImode || mode == TFmode)
4572             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4573                     && offset_9bit_signed_unscaled_p (mode, offset));
4574
4575           if (load_store_pair_p)
4576             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4577                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4578           else
4579             return offset_9bit_signed_unscaled_p (mode, offset);
4580         }
4581       return false;
4582
4583     case CONST:
4584     case SYMBOL_REF:
4585     case LABEL_REF:
4586       /* load literal: pc-relative constant pool entry.  Only supported
4587          for SI mode or larger.  */
4588       info->type = ADDRESS_SYMBOLIC;
4589
4590       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4591         {
4592           rtx sym, addend;
4593
4594           split_const (x, &sym, &addend);
4595           return ((GET_CODE (sym) == LABEL_REF
4596                    || (GET_CODE (sym) == SYMBOL_REF
4597                        && CONSTANT_POOL_ADDRESS_P (sym)
4598                        && aarch64_pcrelative_literal_loads)));
4599         }
4600       return false;
4601
4602     case LO_SUM:
4603       info->type = ADDRESS_LO_SUM;
4604       info->base = XEXP (x, 0);
4605       info->offset = XEXP (x, 1);
4606       if (allow_reg_index_p
4607           && aarch64_base_register_rtx_p (info->base, strict_p))
4608         {
4609           rtx sym, offs;
4610           split_const (info->offset, &sym, &offs);
4611           if (GET_CODE (sym) == SYMBOL_REF
4612               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4613             {
4614               /* The symbol and offset must be aligned to the access size.  */
4615               unsigned int align;
4616               unsigned int ref_size;
4617
4618               if (CONSTANT_POOL_ADDRESS_P (sym))
4619                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4620               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4621                 {
4622                   tree exp = SYMBOL_REF_DECL (sym);
4623                   align = TYPE_ALIGN (TREE_TYPE (exp));
4624                   align = CONSTANT_ALIGNMENT (exp, align);
4625                 }
4626               else if (SYMBOL_REF_DECL (sym))
4627                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4628               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4629                        && SYMBOL_REF_BLOCK (sym) != NULL)
4630                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4631               else
4632                 align = BITS_PER_UNIT;
4633
4634               ref_size = GET_MODE_SIZE (mode);
4635               if (ref_size == 0)
4636                 ref_size = GET_MODE_SIZE (DImode);
4637
4638               return ((INTVAL (offs) & (ref_size - 1)) == 0
4639                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4640             }
4641         }
4642       return false;
4643
4644     default:
4645       return false;
4646     }
4647 }
4648
4649 /* Return true if the address X is valid for a PRFM instruction.
4650    STRICT_P is true if we should do strict checking with
4651    aarch64_classify_address.  */
4652
4653 bool
4654 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4655 {
4656   struct aarch64_address_info addr;
4657
4658   /* PRFM accepts the same addresses as DImode...  */
4659   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4660   if (!res)
4661     return false;
4662
4663   /* ... except writeback forms.  */
4664   return addr.type != ADDRESS_REG_WB;
4665 }
4666
4667 bool
4668 aarch64_symbolic_address_p (rtx x)
4669 {
4670   rtx offset;
4671
4672   split_const (x, &x, &offset);
4673   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4674 }
4675
4676 /* Classify the base of symbolic expression X.  */
4677
4678 enum aarch64_symbol_type
4679 aarch64_classify_symbolic_expression (rtx x)
4680 {
4681   rtx offset;
4682
4683   split_const (x, &x, &offset);
4684   return aarch64_classify_symbol (x, offset);
4685 }
4686
4687
4688 /* Return TRUE if X is a legitimate address for accessing memory in
4689    mode MODE.  */
4690 static bool
4691 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4692 {
4693   struct aarch64_address_info addr;
4694
4695   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4696 }
4697
4698 /* Return TRUE if X is a legitimate address for accessing memory in
4699    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4700    pair operation.  */
4701 bool
4702 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4703                               RTX_CODE outer_code, bool strict_p)
4704 {
4705   struct aarch64_address_info addr;
4706
4707   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4708 }
4709
4710 /* Split an out-of-range address displacement into a base and offset.
4711    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4712    to increase opportunities for sharing the base address of different sizes.
4713    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4714 static bool
4715 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4716 {
4717   HOST_WIDE_INT offset = INTVAL (*disp);
4718   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4719
4720   if (mode == TImode || mode == TFmode
4721       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4722     base = (offset + 0x100) & ~0x1ff;
4723
4724   *off = GEN_INT (base);
4725   *disp = GEN_INT (offset - base);
4726   return true;
4727 }
4728
4729 /* Return the binary representation of floating point constant VALUE in INTVAL.
4730    If the value cannot be converted, return false without setting INTVAL.
4731    The conversion is done in the given MODE.  */
4732 bool
4733 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4734 {
4735
4736   /* We make a general exception for 0.  */
4737   if (aarch64_float_const_zero_rtx_p (value))
4738     {
4739       *intval = 0;
4740       return true;
4741     }
4742
4743   machine_mode mode = GET_MODE (value);
4744   if (GET_CODE (value) != CONST_DOUBLE
4745       || !SCALAR_FLOAT_MODE_P (mode)
4746       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4747       /* Only support up to DF mode.  */
4748       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4749     return false;
4750
4751   unsigned HOST_WIDE_INT ival = 0;
4752
4753   long res[2];
4754   real_to_target (res,
4755                   CONST_DOUBLE_REAL_VALUE (value),
4756                   REAL_MODE_FORMAT (mode));
4757
4758   if (mode == DFmode)
4759     {
4760       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4761       ival = zext_hwi (res[order], 32);
4762       ival |= (zext_hwi (res[1 - order], 32) << 32);
4763     }
4764   else
4765       ival = zext_hwi (res[0], 32);
4766
4767   *intval = ival;
4768   return true;
4769 }
4770
4771 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4772    single MOV(+MOVK) followed by an FMOV.  */
4773 bool
4774 aarch64_float_const_rtx_p (rtx x)
4775 {
4776   machine_mode mode = GET_MODE (x);
4777   if (mode == VOIDmode)
4778     return false;
4779
4780   /* Determine whether it's cheaper to write float constants as
4781      mov/movk pairs over ldr/adrp pairs.  */
4782   unsigned HOST_WIDE_INT ival;
4783
4784   if (GET_CODE (x) == CONST_DOUBLE
4785       && SCALAR_FLOAT_MODE_P (mode)
4786       && aarch64_reinterpret_float_as_int (x, &ival))
4787     {
4788       scalar_int_mode imode = (mode == HFmode
4789                                ? SImode
4790                                : int_mode_for_mode (mode).require ());
4791       int num_instr = aarch64_internal_mov_immediate
4792                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4793       return num_instr < 3;
4794     }
4795
4796   return false;
4797 }
4798
4799 /* Return TRUE if rtx X is immediate constant 0.0 */
4800 bool
4801 aarch64_float_const_zero_rtx_p (rtx x)
4802 {
4803   if (GET_MODE (x) == VOIDmode)
4804     return false;
4805
4806   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4807     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4808   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4809 }
4810
4811 /* Return TRUE if rtx X is immediate constant that fits in a single
4812    MOVI immediate operation.  */
4813 bool
4814 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4815 {
4816   if (!TARGET_SIMD)
4817      return false;
4818
4819   machine_mode vmode;
4820   scalar_int_mode imode;
4821   unsigned HOST_WIDE_INT ival;
4822
4823   if (GET_CODE (x) == CONST_DOUBLE
4824       && SCALAR_FLOAT_MODE_P (mode))
4825     {
4826       if (!aarch64_reinterpret_float_as_int (x, &ival))
4827         return false;
4828
4829       /* We make a general exception for 0.  */
4830       if (aarch64_float_const_zero_rtx_p (x))
4831         return true;
4832
4833       imode = int_mode_for_mode (mode).require ();
4834     }
4835   else if (GET_CODE (x) == CONST_INT
4836            && is_a <scalar_int_mode> (mode, &imode))
4837     ival = INTVAL (x);
4838   else
4839     return false;
4840
4841    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4842      a 128 bit vector mode.  */
4843   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4844
4845   vmode = aarch64_simd_container_mode (imode, width);
4846   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4847
4848   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4849 }
4850
4851
4852 /* Return the fixed registers used for condition codes.  */
4853
4854 static bool
4855 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4856 {
4857   *p1 = CC_REGNUM;
4858   *p2 = INVALID_REGNUM;
4859   return true;
4860 }
4861
4862 /* This function is used by the call expanders of the machine description.
4863    RESULT is the register in which the result is returned.  It's NULL for
4864    "call" and "sibcall".
4865    MEM is the location of the function call.
4866    SIBCALL indicates whether this function call is normal call or sibling call.
4867    It will generate different pattern accordingly.  */
4868
4869 void
4870 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4871 {
4872   rtx call, callee, tmp;
4873   rtvec vec;
4874   machine_mode mode;
4875
4876   gcc_assert (MEM_P (mem));
4877   callee = XEXP (mem, 0);
4878   mode = GET_MODE (callee);
4879   gcc_assert (mode == Pmode);
4880
4881   /* Decide if we should generate indirect calls by loading the
4882      address of the callee into a register before performing
4883      the branch-and-link.  */
4884   if (SYMBOL_REF_P (callee)
4885       ? (aarch64_is_long_call_p (callee)
4886          || aarch64_is_noplt_call_p (callee))
4887       : !REG_P (callee))
4888     XEXP (mem, 0) = force_reg (mode, callee);
4889
4890   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4891
4892   if (result != NULL_RTX)
4893     call = gen_rtx_SET (result, call);
4894
4895   if (sibcall)
4896     tmp = ret_rtx;
4897   else
4898     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4899
4900   vec = gen_rtvec (2, call, tmp);
4901   call = gen_rtx_PARALLEL (VOIDmode, vec);
4902
4903   aarch64_emit_call_insn (call);
4904 }
4905
4906 /* Emit call insn with PAT and do aarch64-specific handling.  */
4907
4908 void
4909 aarch64_emit_call_insn (rtx pat)
4910 {
4911   rtx insn = emit_call_insn (pat);
4912
4913   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4914   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4915   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4916 }
4917
4918 machine_mode
4919 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4920 {
4921   /* All floating point compares return CCFP if it is an equality
4922      comparison, and CCFPE otherwise.  */
4923   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4924     {
4925       switch (code)
4926         {
4927         case EQ:
4928         case NE:
4929         case UNORDERED:
4930         case ORDERED:
4931         case UNLT:
4932         case UNLE:
4933         case UNGT:
4934         case UNGE:
4935         case UNEQ:
4936         case LTGT:
4937           return CCFPmode;
4938
4939         case LT:
4940         case LE:
4941         case GT:
4942         case GE:
4943           return CCFPEmode;
4944
4945         default:
4946           gcc_unreachable ();
4947         }
4948     }
4949
4950   /* Equality comparisons of short modes against zero can be performed
4951      using the TST instruction with the appropriate bitmask.  */
4952   if (y == const0_rtx && REG_P (x)
4953       && (code == EQ || code == NE)
4954       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4955     return CC_NZmode;
4956
4957   /* Similarly, comparisons of zero_extends from shorter modes can
4958      be performed using an ANDS with an immediate mask.  */
4959   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4960       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4961       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4962       && (code == EQ || code == NE))
4963     return CC_NZmode;
4964
4965   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4966       && y == const0_rtx
4967       && (code == EQ || code == NE || code == LT || code == GE)
4968       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4969           || GET_CODE (x) == NEG
4970           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4971               && CONST_INT_P (XEXP (x, 2)))))
4972     return CC_NZmode;
4973
4974   /* A compare with a shifted operand.  Because of canonicalization,
4975      the comparison will have to be swapped when we emit the assembly
4976      code.  */
4977   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4978       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4979       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4980           || GET_CODE (x) == LSHIFTRT
4981           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4982     return CC_SWPmode;
4983
4984   /* Similarly for a negated operand, but we can only do this for
4985      equalities.  */
4986   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4987       && (REG_P (y) || GET_CODE (y) == SUBREG)
4988       && (code == EQ || code == NE)
4989       && GET_CODE (x) == NEG)
4990     return CC_Zmode;
4991
4992   /* A test for unsigned overflow.  */
4993   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4994       && code == NE
4995       && GET_CODE (x) == PLUS
4996       && GET_CODE (y) == ZERO_EXTEND)
4997     return CC_Cmode;
4998
4999   /* For everything else, return CCmode.  */
5000   return CCmode;
5001 }
5002
5003 static int
5004 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5005
5006 int
5007 aarch64_get_condition_code (rtx x)
5008 {
5009   machine_mode mode = GET_MODE (XEXP (x, 0));
5010   enum rtx_code comp_code = GET_CODE (x);
5011
5012   if (GET_MODE_CLASS (mode) != MODE_CC)
5013     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5014   return aarch64_get_condition_code_1 (mode, comp_code);
5015 }
5016
5017 static int
5018 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5019 {
5020   switch (mode)
5021     {
5022     case E_CCFPmode:
5023     case E_CCFPEmode:
5024       switch (comp_code)
5025         {
5026         case GE: return AARCH64_GE;
5027         case GT: return AARCH64_GT;
5028         case LE: return AARCH64_LS;
5029         case LT: return AARCH64_MI;
5030         case NE: return AARCH64_NE;
5031         case EQ: return AARCH64_EQ;
5032         case ORDERED: return AARCH64_VC;
5033         case UNORDERED: return AARCH64_VS;
5034         case UNLT: return AARCH64_LT;
5035         case UNLE: return AARCH64_LE;
5036         case UNGT: return AARCH64_HI;
5037         case UNGE: return AARCH64_PL;
5038         default: return -1;
5039         }
5040       break;
5041
5042     case E_CCmode:
5043       switch (comp_code)
5044         {
5045         case NE: return AARCH64_NE;
5046         case EQ: return AARCH64_EQ;
5047         case GE: return AARCH64_GE;
5048         case GT: return AARCH64_GT;
5049         case LE: return AARCH64_LE;
5050         case LT: return AARCH64_LT;
5051         case GEU: return AARCH64_CS;
5052         case GTU: return AARCH64_HI;
5053         case LEU: return AARCH64_LS;
5054         case LTU: return AARCH64_CC;
5055         default: return -1;
5056         }
5057       break;
5058
5059     case E_CC_SWPmode:
5060       switch (comp_code)
5061         {
5062         case NE: return AARCH64_NE;
5063         case EQ: return AARCH64_EQ;
5064         case GE: return AARCH64_LE;
5065         case GT: return AARCH64_LT;
5066         case LE: return AARCH64_GE;
5067         case LT: return AARCH64_GT;
5068         case GEU: return AARCH64_LS;
5069         case GTU: return AARCH64_CC;
5070         case LEU: return AARCH64_CS;
5071         case LTU: return AARCH64_HI;
5072         default: return -1;
5073         }
5074       break;
5075
5076     case E_CC_NZmode:
5077       switch (comp_code)
5078         {
5079         case NE: return AARCH64_NE;
5080         case EQ: return AARCH64_EQ;
5081         case GE: return AARCH64_PL;
5082         case LT: return AARCH64_MI;
5083         default: return -1;
5084         }
5085       break;
5086
5087     case E_CC_Zmode:
5088       switch (comp_code)
5089         {
5090         case NE: return AARCH64_NE;
5091         case EQ: return AARCH64_EQ;
5092         default: return -1;
5093         }
5094       break;
5095
5096     case E_CC_Cmode:
5097       switch (comp_code)
5098         {
5099         case NE: return AARCH64_CS;
5100         case EQ: return AARCH64_CC;
5101         default: return -1;
5102         }
5103       break;
5104
5105     default:
5106       return -1;
5107     }
5108
5109   return -1;
5110 }
5111
5112 bool
5113 aarch64_const_vec_all_same_in_range_p (rtx x,
5114                                   HOST_WIDE_INT minval,
5115                                   HOST_WIDE_INT maxval)
5116 {
5117   HOST_WIDE_INT firstval;
5118   int count, i;
5119
5120   if (GET_CODE (x) != CONST_VECTOR
5121       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5122     return false;
5123
5124   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5125   if (firstval < minval || firstval > maxval)
5126     return false;
5127
5128   count = CONST_VECTOR_NUNITS (x);
5129   for (i = 1; i < count; i++)
5130     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5131       return false;
5132
5133   return true;
5134 }
5135
5136 bool
5137 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5138 {
5139   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5140 }
5141
5142
5143 /* N Z C V.  */
5144 #define AARCH64_CC_V 1
5145 #define AARCH64_CC_C (1 << 1)
5146 #define AARCH64_CC_Z (1 << 2)
5147 #define AARCH64_CC_N (1 << 3)
5148
5149 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5150 static const int aarch64_nzcv_codes[] =
5151 {
5152   0,            /* EQ, Z == 1.  */
5153   AARCH64_CC_Z, /* NE, Z == 0.  */
5154   0,            /* CS, C == 1.  */
5155   AARCH64_CC_C, /* CC, C == 0.  */
5156   0,            /* MI, N == 1.  */
5157   AARCH64_CC_N, /* PL, N == 0.  */
5158   0,            /* VS, V == 1.  */
5159   AARCH64_CC_V, /* VC, V == 0.  */
5160   0,            /* HI, C ==1 && Z == 0.  */
5161   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5162   AARCH64_CC_V, /* GE, N == V.  */
5163   0,            /* LT, N != V.  */
5164   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5165   0,            /* LE, !(Z == 0 && N == V).  */
5166   0,            /* AL, Any.  */
5167   0             /* NV, Any.  */
5168 };
5169
5170 /* Print operand X to file F in a target specific manner according to CODE.
5171    The acceptable formatting commands given by CODE are:
5172      'c':               An integer or symbol address without a preceding #
5173                         sign.
5174      'e':               Print the sign/zero-extend size as a character 8->b,
5175                         16->h, 32->w.
5176      'p':               Prints N such that 2^N == X (X must be power of 2 and
5177                         const int).
5178      'P':               Print the number of non-zero bits in X (a const_int).
5179      'H':               Print the higher numbered register of a pair (TImode)
5180                         of regs.
5181      'm':               Print a condition (eq, ne, etc).
5182      'M':               Same as 'm', but invert condition.
5183      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5184      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5185                         The register printed is the FP/SIMD register name
5186                         of X + 0/1/2/3 for S/T/U/V.
5187      'R':               Print a scalar FP/SIMD register name + 1.
5188      'X':               Print bottom 16 bits of integer constant in hex.
5189      'w/x':             Print a general register name or the zero register
5190                         (32-bit or 64-bit).
5191      '0':               Print a normal operand, if it's a general register,
5192                         then we assume DImode.
5193      'k':               Print NZCV for conditional compare instructions.
5194      'A':               Output address constant representing the first
5195                         argument of X, specifying a relocation offset
5196                         if appropriate.
5197      'L':               Output constant address specified by X
5198                         with a relocation offset if appropriate.
5199      'G':               Prints address of X, specifying a PC relative
5200                         relocation mode if appropriate.  */
5201
5202 static void
5203 aarch64_print_operand (FILE *f, rtx x, int code)
5204 {
5205   switch (code)
5206     {
5207     case 'c':
5208       switch (GET_CODE (x))
5209         {
5210         case CONST_INT:
5211           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5212           break;
5213
5214         case SYMBOL_REF:
5215           output_addr_const (f, x);
5216           break;
5217
5218         case CONST:
5219           if (GET_CODE (XEXP (x, 0)) == PLUS
5220               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5221             {
5222               output_addr_const (f, x);
5223               break;
5224             }
5225           /* Fall through.  */
5226
5227         default:
5228           output_operand_lossage ("Unsupported operand for code '%c'", code);
5229         }
5230       break;
5231
5232     case 'e':
5233       {
5234         int n;
5235
5236         if (!CONST_INT_P (x)
5237             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5238           {
5239             output_operand_lossage ("invalid operand for '%%%c'", code);
5240             return;
5241           }
5242
5243         switch (n)
5244           {
5245           case 3:
5246             fputc ('b', f);
5247             break;
5248           case 4:
5249             fputc ('h', f);
5250             break;
5251           case 5:
5252             fputc ('w', f);
5253             break;
5254           default:
5255             output_operand_lossage ("invalid operand for '%%%c'", code);
5256             return;
5257           }
5258       }
5259       break;
5260
5261     case 'p':
5262       {
5263         int n;
5264
5265         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5266           {
5267             output_operand_lossage ("invalid operand for '%%%c'", code);
5268             return;
5269           }
5270
5271         asm_fprintf (f, "%d", n);
5272       }
5273       break;
5274
5275     case 'P':
5276       if (!CONST_INT_P (x))
5277         {
5278           output_operand_lossage ("invalid operand for '%%%c'", code);
5279           return;
5280         }
5281
5282       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5283       break;
5284
5285     case 'H':
5286       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5287         {
5288           output_operand_lossage ("invalid operand for '%%%c'", code);
5289           return;
5290         }
5291
5292       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5293       break;
5294
5295     case 'M':
5296     case 'm':
5297       {
5298         int cond_code;
5299         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5300         if (x == const_true_rtx)
5301           {
5302             if (code == 'M')
5303               fputs ("nv", f);
5304             return;
5305           }
5306
5307         if (!COMPARISON_P (x))
5308           {
5309             output_operand_lossage ("invalid operand for '%%%c'", code);
5310             return;
5311           }
5312
5313         cond_code = aarch64_get_condition_code (x);
5314         gcc_assert (cond_code >= 0);
5315         if (code == 'M')
5316           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5317         fputs (aarch64_condition_codes[cond_code], f);
5318       }
5319       break;
5320
5321     case 'b':
5322     case 'h':
5323     case 's':
5324     case 'd':
5325     case 'q':
5326       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5327         {
5328           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5329           return;
5330         }
5331       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5332       break;
5333
5334     case 'S':
5335     case 'T':
5336     case 'U':
5337     case 'V':
5338       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5339         {
5340           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5341           return;
5342         }
5343       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5344       break;
5345
5346     case 'R':
5347       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5348         {
5349           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5350           return;
5351         }
5352       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5353       break;
5354
5355     case 'X':
5356       if (!CONST_INT_P (x))
5357         {
5358           output_operand_lossage ("invalid operand for '%%%c'", code);
5359           return;
5360         }
5361       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5362       break;
5363
5364     case 'w':
5365     case 'x':
5366       if (x == const0_rtx
5367           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5368         {
5369           asm_fprintf (f, "%czr", code);
5370           break;
5371         }
5372
5373       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5374         {
5375           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5376           break;
5377         }
5378
5379       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5380         {
5381           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5382           break;
5383         }
5384
5385       /* Fall through */
5386
5387     case 0:
5388       if (x == NULL)
5389         {
5390           output_operand_lossage ("missing operand");
5391           return;
5392         }
5393
5394       switch (GET_CODE (x))
5395         {
5396         case REG:
5397           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5398           break;
5399
5400         case MEM:
5401           output_address (GET_MODE (x), XEXP (x, 0));
5402           /* Check all memory references are Pmode - even with ILP32.  */
5403           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5404           break;
5405
5406         case CONST:
5407         case LABEL_REF:
5408         case SYMBOL_REF:
5409           output_addr_const (asm_out_file, x);
5410           break;
5411
5412         case CONST_INT:
5413           asm_fprintf (f, "%wd", INTVAL (x));
5414           break;
5415
5416         case CONST_VECTOR:
5417           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5418             {
5419               gcc_assert (
5420                   aarch64_const_vec_all_same_in_range_p (x,
5421                                                          HOST_WIDE_INT_MIN,
5422                                                          HOST_WIDE_INT_MAX));
5423               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5424             }
5425           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5426             {
5427               fputc ('0', f);
5428             }
5429           else
5430             gcc_unreachable ();
5431           break;
5432
5433         case CONST_DOUBLE:
5434           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5435              be getting CONST_DOUBLEs holding integers.  */
5436           gcc_assert (GET_MODE (x) != VOIDmode);
5437           if (aarch64_float_const_zero_rtx_p (x))
5438             {
5439               fputc ('0', f);
5440               break;
5441             }
5442           else if (aarch64_float_const_representable_p (x))
5443             {
5444 #define buf_size 20
5445               char float_buf[buf_size] = {'\0'};
5446               real_to_decimal_for_mode (float_buf,
5447                                         CONST_DOUBLE_REAL_VALUE (x),
5448                                         buf_size, buf_size,
5449                                         1, GET_MODE (x));
5450               asm_fprintf (asm_out_file, "%s", float_buf);
5451               break;
5452 #undef buf_size
5453             }
5454           output_operand_lossage ("invalid constant");
5455           return;
5456         default:
5457           output_operand_lossage ("invalid operand");
5458           return;
5459         }
5460       break;
5461
5462     case 'A':
5463       if (GET_CODE (x) == HIGH)
5464         x = XEXP (x, 0);
5465
5466       switch (aarch64_classify_symbolic_expression (x))
5467         {
5468         case SYMBOL_SMALL_GOT_4G:
5469           asm_fprintf (asm_out_file, ":got:");
5470           break;
5471
5472         case SYMBOL_SMALL_TLSGD:
5473           asm_fprintf (asm_out_file, ":tlsgd:");
5474           break;
5475
5476         case SYMBOL_SMALL_TLSDESC:
5477           asm_fprintf (asm_out_file, ":tlsdesc:");
5478           break;
5479
5480         case SYMBOL_SMALL_TLSIE:
5481           asm_fprintf (asm_out_file, ":gottprel:");
5482           break;
5483
5484         case SYMBOL_TLSLE24:
5485           asm_fprintf (asm_out_file, ":tprel:");
5486           break;
5487
5488         case SYMBOL_TINY_GOT:
5489           gcc_unreachable ();
5490           break;
5491
5492         default:
5493           break;
5494         }
5495       output_addr_const (asm_out_file, x);
5496       break;
5497
5498     case 'L':
5499       switch (aarch64_classify_symbolic_expression (x))
5500         {
5501         case SYMBOL_SMALL_GOT_4G:
5502           asm_fprintf (asm_out_file, ":lo12:");
5503           break;
5504
5505         case SYMBOL_SMALL_TLSGD:
5506           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5507           break;
5508
5509         case SYMBOL_SMALL_TLSDESC:
5510           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5511           break;
5512
5513         case SYMBOL_SMALL_TLSIE:
5514           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5515           break;
5516
5517         case SYMBOL_TLSLE12:
5518           asm_fprintf (asm_out_file, ":tprel_lo12:");
5519           break;
5520
5521         case SYMBOL_TLSLE24:
5522           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5523           break;
5524
5525         case SYMBOL_TINY_GOT:
5526           asm_fprintf (asm_out_file, ":got:");
5527           break;
5528
5529         case SYMBOL_TINY_TLSIE:
5530           asm_fprintf (asm_out_file, ":gottprel:");
5531           break;
5532
5533         default:
5534           break;
5535         }
5536       output_addr_const (asm_out_file, x);
5537       break;
5538
5539     case 'G':
5540       switch (aarch64_classify_symbolic_expression (x))
5541         {
5542         case SYMBOL_TLSLE24:
5543           asm_fprintf (asm_out_file, ":tprel_hi12:");
5544           break;
5545         default:
5546           break;
5547         }
5548       output_addr_const (asm_out_file, x);
5549       break;
5550
5551     case 'k':
5552       {
5553         HOST_WIDE_INT cond_code;
5554
5555         if (!CONST_INT_P (x))
5556           {
5557             output_operand_lossage ("invalid operand for '%%%c'", code);
5558             return;
5559           }
5560
5561         cond_code = INTVAL (x);
5562         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5563         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5564       }
5565       break;
5566
5567     default:
5568       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5569       return;
5570     }
5571 }
5572
5573 static void
5574 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5575 {
5576   struct aarch64_address_info addr;
5577
5578   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5579     switch (addr.type)
5580       {
5581       case ADDRESS_REG_IMM:
5582         if (addr.offset == const0_rtx)
5583           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5584         else
5585           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5586                        INTVAL (addr.offset));
5587         return;
5588
5589       case ADDRESS_REG_REG:
5590         if (addr.shift == 0)
5591           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5592                        reg_names [REGNO (addr.offset)]);
5593         else
5594           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5595                        reg_names [REGNO (addr.offset)], addr.shift);
5596         return;
5597
5598       case ADDRESS_REG_UXTW:
5599         if (addr.shift == 0)
5600           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5601                        REGNO (addr.offset) - R0_REGNUM);
5602         else
5603           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5604                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5605         return;
5606
5607       case ADDRESS_REG_SXTW:
5608         if (addr.shift == 0)
5609           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5610                        REGNO (addr.offset) - R0_REGNUM);
5611         else
5612           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5613                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5614         return;
5615
5616       case ADDRESS_REG_WB:
5617         switch (GET_CODE (x))
5618           {
5619           case PRE_INC:
5620             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5621                          GET_MODE_SIZE (mode));
5622             return;
5623           case POST_INC:
5624             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5625                          GET_MODE_SIZE (mode));
5626             return;
5627           case PRE_DEC:
5628             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5629                          GET_MODE_SIZE (mode));
5630             return;
5631           case POST_DEC:
5632             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5633                          GET_MODE_SIZE (mode));
5634             return;
5635           case PRE_MODIFY:
5636             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5637                          INTVAL (addr.offset));
5638             return;
5639           case POST_MODIFY:
5640             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5641                          INTVAL (addr.offset));
5642             return;
5643           default:
5644             break;
5645           }
5646         break;
5647
5648       case ADDRESS_LO_SUM:
5649         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5650         output_addr_const (f, addr.offset);
5651         asm_fprintf (f, "]");
5652         return;
5653
5654       case ADDRESS_SYMBOLIC:
5655         break;
5656       }
5657
5658   output_addr_const (f, x);
5659 }
5660
5661 bool
5662 aarch64_label_mentioned_p (rtx x)
5663 {
5664   const char *fmt;
5665   int i;
5666
5667   if (GET_CODE (x) == LABEL_REF)
5668     return true;
5669
5670   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5671      referencing instruction, but they are constant offsets, not
5672      symbols.  */
5673   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5674     return false;
5675
5676   fmt = GET_RTX_FORMAT (GET_CODE (x));
5677   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5678     {
5679       if (fmt[i] == 'E')
5680         {
5681           int j;
5682
5683           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5684             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5685               return 1;
5686         }
5687       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5688         return 1;
5689     }
5690
5691   return 0;
5692 }
5693
5694 /* Implement REGNO_REG_CLASS.  */
5695
5696 enum reg_class
5697 aarch64_regno_regclass (unsigned regno)
5698 {
5699   if (GP_REGNUM_P (regno))
5700     return GENERAL_REGS;
5701
5702   if (regno == SP_REGNUM)
5703     return STACK_REG;
5704
5705   if (regno == FRAME_POINTER_REGNUM
5706       || regno == ARG_POINTER_REGNUM)
5707     return POINTER_REGS;
5708
5709   if (FP_REGNUM_P (regno))
5710     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5711
5712   return NO_REGS;
5713 }
5714
5715 static rtx
5716 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5717 {
5718   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5719      where mask is selected by alignment and size of the offset.
5720      We try to pick as large a range for the offset as possible to
5721      maximize the chance of a CSE.  However, for aligned addresses
5722      we limit the range to 4k so that structures with different sized
5723      elements are likely to use the same base.  We need to be careful
5724      not to split a CONST for some forms of address expression, otherwise
5725      it will generate sub-optimal code.  */
5726
5727   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5728     {
5729       rtx base = XEXP (x, 0);
5730       rtx offset_rtx = XEXP (x, 1);
5731       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5732
5733       if (GET_CODE (base) == PLUS)
5734         {
5735           rtx op0 = XEXP (base, 0);
5736           rtx op1 = XEXP (base, 1);
5737
5738           /* Force any scaling into a temp for CSE.  */
5739           op0 = force_reg (Pmode, op0);
5740           op1 = force_reg (Pmode, op1);
5741
5742           /* Let the pointer register be in op0.  */
5743           if (REG_POINTER (op1))
5744             std::swap (op0, op1);
5745
5746           /* If the pointer is virtual or frame related, then we know that
5747              virtual register instantiation or register elimination is going
5748              to apply a second constant.  We want the two constants folded
5749              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5750           if (virt_or_elim_regno_p (REGNO (op0)))
5751             {
5752               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5753                                    NULL_RTX, true, OPTAB_DIRECT);
5754               return gen_rtx_PLUS (Pmode, base, op1);
5755             }
5756
5757           /* Otherwise, in order to encourage CSE (and thence loop strength
5758              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5759           base = expand_binop (Pmode, add_optab, op0, op1,
5760                                NULL_RTX, true, OPTAB_DIRECT);
5761           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5762         }
5763
5764       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5765       HOST_WIDE_INT base_offset;
5766       if (GET_MODE_SIZE (mode) > 16)
5767         base_offset = (offset + 0x400) & ~0x7f0;
5768       /* For offsets aren't a multiple of the access size, the limit is
5769          -256...255.  */
5770       else if (offset & (GET_MODE_SIZE (mode) - 1))
5771         {
5772           base_offset = (offset + 0x100) & ~0x1ff;
5773
5774           /* BLKmode typically uses LDP of X-registers.  */
5775           if (mode == BLKmode)
5776             base_offset = (offset + 512) & ~0x3ff;
5777         }
5778       /* Small negative offsets are supported.  */
5779       else if (IN_RANGE (offset, -256, 0))
5780         base_offset = 0;
5781       else if (mode == TImode || mode == TFmode)
5782         base_offset = (offset + 0x100) & ~0x1ff;
5783       /* Use 12-bit offset by access size.  */
5784       else
5785         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5786
5787       if (base_offset != 0)
5788         {
5789           base = plus_constant (Pmode, base, base_offset);
5790           base = force_operand (base, NULL_RTX);
5791           return plus_constant (Pmode, base, offset - base_offset);
5792         }
5793     }
5794
5795   return x;
5796 }
5797
5798 /* Return the reload icode required for a constant pool in mode.  */
5799 static enum insn_code
5800 aarch64_constant_pool_reload_icode (machine_mode mode)
5801 {
5802   switch (mode)
5803     {
5804     case E_SFmode:
5805       return CODE_FOR_aarch64_reload_movcpsfdi;
5806
5807     case E_DFmode:
5808       return CODE_FOR_aarch64_reload_movcpdfdi;
5809
5810     case E_TFmode:
5811       return CODE_FOR_aarch64_reload_movcptfdi;
5812
5813     case E_V8QImode:
5814       return CODE_FOR_aarch64_reload_movcpv8qidi;
5815
5816     case E_V16QImode:
5817       return CODE_FOR_aarch64_reload_movcpv16qidi;
5818
5819     case E_V4HImode:
5820       return CODE_FOR_aarch64_reload_movcpv4hidi;
5821
5822     case E_V8HImode:
5823       return CODE_FOR_aarch64_reload_movcpv8hidi;
5824
5825     case E_V2SImode:
5826       return CODE_FOR_aarch64_reload_movcpv2sidi;
5827
5828     case E_V4SImode:
5829       return CODE_FOR_aarch64_reload_movcpv4sidi;
5830
5831     case E_V2DImode:
5832       return CODE_FOR_aarch64_reload_movcpv2didi;
5833
5834     case E_V2DFmode:
5835       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5836
5837     default:
5838       gcc_unreachable ();
5839     }
5840
5841   gcc_unreachable ();
5842 }
5843 static reg_class_t
5844 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5845                           reg_class_t rclass,
5846                           machine_mode mode,
5847                           secondary_reload_info *sri)
5848 {
5849
5850   /* If we have to disable direct literal pool loads and stores because the
5851      function is too big, then we need a scratch register.  */
5852   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5853       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5854           || targetm.vector_mode_supported_p (GET_MODE (x)))
5855       && !aarch64_pcrelative_literal_loads)
5856     {
5857       sri->icode = aarch64_constant_pool_reload_icode (mode);
5858       return NO_REGS;
5859     }
5860
5861   /* Without the TARGET_SIMD instructions we cannot move a Q register
5862      to a Q register directly.  We need a scratch.  */
5863   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5864       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5865       && reg_class_subset_p (rclass, FP_REGS))
5866     {
5867       if (mode == TFmode)
5868         sri->icode = CODE_FOR_aarch64_reload_movtf;
5869       else if (mode == TImode)
5870         sri->icode = CODE_FOR_aarch64_reload_movti;
5871       return NO_REGS;
5872     }
5873
5874   /* A TFmode or TImode memory access should be handled via an FP_REGS
5875      because AArch64 has richer addressing modes for LDR/STR instructions
5876      than LDP/STP instructions.  */
5877   if (TARGET_FLOAT && rclass == GENERAL_REGS
5878       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5879     return FP_REGS;
5880
5881   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5882       return GENERAL_REGS;
5883
5884   return NO_REGS;
5885 }
5886
5887 static bool
5888 aarch64_can_eliminate (const int from, const int to)
5889 {
5890   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5891      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5892
5893   if (frame_pointer_needed)
5894     {
5895       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5896         return true;
5897       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5898         return false;
5899       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5900           && !cfun->calls_alloca)
5901         return true;
5902       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5903         return true;
5904
5905       return false;
5906     }
5907   else
5908     {
5909       /* If we decided that we didn't need a leaf frame pointer but then used
5910          LR in the function, then we'll want a frame pointer after all, so
5911          prevent this elimination to ensure a frame pointer is used.  */
5912       if (to == STACK_POINTER_REGNUM
5913           && flag_omit_leaf_frame_pointer
5914           && df_regs_ever_live_p (LR_REGNUM))
5915         return false;
5916     }
5917
5918   return true;
5919 }
5920
5921 HOST_WIDE_INT
5922 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5923 {
5924   aarch64_layout_frame ();
5925
5926   if (to == HARD_FRAME_POINTER_REGNUM)
5927     {
5928       if (from == ARG_POINTER_REGNUM)
5929         return cfun->machine->frame.hard_fp_offset;
5930
5931       if (from == FRAME_POINTER_REGNUM)
5932         return cfun->machine->frame.hard_fp_offset
5933                - cfun->machine->frame.locals_offset;
5934     }
5935
5936   if (to == STACK_POINTER_REGNUM)
5937     {
5938       if (from == FRAME_POINTER_REGNUM)
5939           return cfun->machine->frame.frame_size
5940                  - cfun->machine->frame.locals_offset;
5941     }
5942
5943   return cfun->machine->frame.frame_size;
5944 }
5945
5946 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5947    previous frame.  */
5948
5949 rtx
5950 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5951 {
5952   if (count != 0)
5953     return const0_rtx;
5954   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5955 }
5956
5957
5958 static void
5959 aarch64_asm_trampoline_template (FILE *f)
5960 {
5961   if (TARGET_ILP32)
5962     {
5963       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5964       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5965     }
5966   else
5967     {
5968       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5969       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5970     }
5971   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5972   assemble_aligned_integer (4, const0_rtx);
5973   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5974   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5975 }
5976
5977 static void
5978 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5979 {
5980   rtx fnaddr, mem, a_tramp;
5981   const int tramp_code_sz = 16;
5982
5983   /* Don't need to copy the trailing D-words, we fill those in below.  */
5984   emit_block_move (m_tramp, assemble_trampoline_template (),
5985                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5986   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5987   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5988   if (GET_MODE (fnaddr) != ptr_mode)
5989     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5990   emit_move_insn (mem, fnaddr);
5991
5992   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5993   emit_move_insn (mem, chain_value);
5994
5995   /* XXX We should really define a "clear_cache" pattern and use
5996      gen_clear_cache().  */
5997   a_tramp = XEXP (m_tramp, 0);
5998   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5999                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6000                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6001                      ptr_mode);
6002 }
6003
6004 static unsigned char
6005 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6006 {
6007   switch (regclass)
6008     {
6009     case CALLER_SAVE_REGS:
6010     case POINTER_REGS:
6011     case GENERAL_REGS:
6012     case ALL_REGS:
6013     case FP_REGS:
6014     case FP_LO_REGS:
6015       return
6016         aarch64_vector_mode_p (mode)
6017           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6018           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6019     case STACK_REG:
6020       return 1;
6021
6022     case NO_REGS:
6023       return 0;
6024
6025     default:
6026       break;
6027     }
6028   gcc_unreachable ();
6029 }
6030
6031 static reg_class_t
6032 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6033 {
6034   if (regclass == POINTER_REGS)
6035     return GENERAL_REGS;
6036
6037   if (regclass == STACK_REG)
6038     {
6039       if (REG_P(x)
6040           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6041           return regclass;
6042
6043       return NO_REGS;
6044     }
6045
6046   /* Register eliminiation can result in a request for
6047      SP+constant->FP_REGS.  We cannot support such operations which
6048      use SP as source and an FP_REG as destination, so reject out
6049      right now.  */
6050   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6051     {
6052       rtx lhs = XEXP (x, 0);
6053
6054       /* Look through a possible SUBREG introduced by ILP32.  */
6055       if (GET_CODE (lhs) == SUBREG)
6056         lhs = SUBREG_REG (lhs);
6057
6058       gcc_assert (REG_P (lhs));
6059       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6060                                       POINTER_REGS));
6061       return NO_REGS;
6062     }
6063
6064   return regclass;
6065 }
6066
6067 void
6068 aarch64_asm_output_labelref (FILE* f, const char *name)
6069 {
6070   asm_fprintf (f, "%U%s", name);
6071 }
6072
6073 static void
6074 aarch64_elf_asm_constructor (rtx symbol, int priority)
6075 {
6076   if (priority == DEFAULT_INIT_PRIORITY)
6077     default_ctor_section_asm_out_constructor (symbol, priority);
6078   else
6079     {
6080       section *s;
6081       /* While priority is known to be in range [0, 65535], so 18 bytes
6082          would be enough, the compiler might not know that.  To avoid
6083          -Wformat-truncation false positive, use a larger size.  */
6084       char buf[23];
6085       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6086       s = get_section (buf, SECTION_WRITE, NULL);
6087       switch_to_section (s);
6088       assemble_align (POINTER_SIZE);
6089       assemble_aligned_integer (POINTER_BYTES, symbol);
6090     }
6091 }
6092
6093 static void
6094 aarch64_elf_asm_destructor (rtx symbol, int priority)
6095 {
6096   if (priority == DEFAULT_INIT_PRIORITY)
6097     default_dtor_section_asm_out_destructor (symbol, priority);
6098   else
6099     {
6100       section *s;
6101       /* While priority is known to be in range [0, 65535], so 18 bytes
6102          would be enough, the compiler might not know that.  To avoid
6103          -Wformat-truncation false positive, use a larger size.  */
6104       char buf[23];
6105       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6106       s = get_section (buf, SECTION_WRITE, NULL);
6107       switch_to_section (s);
6108       assemble_align (POINTER_SIZE);
6109       assemble_aligned_integer (POINTER_BYTES, symbol);
6110     }
6111 }
6112
6113 const char*
6114 aarch64_output_casesi (rtx *operands)
6115 {
6116   char buf[100];
6117   char label[100];
6118   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6119   int index;
6120   static const char *const patterns[4][2] =
6121   {
6122     {
6123       "ldrb\t%w3, [%0,%w1,uxtw]",
6124       "add\t%3, %4, %w3, sxtb #2"
6125     },
6126     {
6127       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6128       "add\t%3, %4, %w3, sxth #2"
6129     },
6130     {
6131       "ldr\t%w3, [%0,%w1,uxtw #2]",
6132       "add\t%3, %4, %w3, sxtw #2"
6133     },
6134     /* We assume that DImode is only generated when not optimizing and
6135        that we don't really need 64-bit address offsets.  That would
6136        imply an object file with 8GB of code in a single function!  */
6137     {
6138       "ldr\t%w3, [%0,%w1,uxtw #2]",
6139       "add\t%3, %4, %w3, sxtw #2"
6140     }
6141   };
6142
6143   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6144
6145   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6146   index = exact_log2 (GET_MODE_SIZE (mode));
6147
6148   gcc_assert (index >= 0 && index <= 3);
6149
6150   /* Need to implement table size reduction, by chaning the code below.  */
6151   output_asm_insn (patterns[index][0], operands);
6152   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6153   snprintf (buf, sizeof (buf),
6154             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6155   output_asm_insn (buf, operands);
6156   output_asm_insn (patterns[index][1], operands);
6157   output_asm_insn ("br\t%3", operands);
6158   assemble_label (asm_out_file, label);
6159   return "";
6160 }
6161
6162
6163 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6164    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6165    operator.  */
6166
6167 int
6168 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6169 {
6170   if (shift >= 0 && shift <= 3)
6171     {
6172       int size;
6173       for (size = 8; size <= 32; size *= 2)
6174         {
6175           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6176           if (mask == bits << shift)
6177             return size;
6178         }
6179     }
6180   return 0;
6181 }
6182
6183 /* Constant pools are per function only when PC relative
6184    literal loads are true or we are in the large memory
6185    model.  */
6186
6187 static inline bool
6188 aarch64_can_use_per_function_literal_pools_p (void)
6189 {
6190   return (aarch64_pcrelative_literal_loads
6191           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6192 }
6193
6194 static bool
6195 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6196 {
6197   /* Fixme:: In an ideal world this would work similar
6198      to the logic in aarch64_select_rtx_section but this
6199      breaks bootstrap in gcc go.  For now we workaround
6200      this by returning false here.  */
6201   return false;
6202 }
6203
6204 /* Select appropriate section for constants depending
6205    on where we place literal pools.  */
6206
6207 static section *
6208 aarch64_select_rtx_section (machine_mode mode,
6209                             rtx x,
6210                             unsigned HOST_WIDE_INT align)
6211 {
6212   if (aarch64_can_use_per_function_literal_pools_p ())
6213     return function_section (current_function_decl);
6214
6215   return default_elf_select_rtx_section (mode, x, align);
6216 }
6217
6218 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6219 void
6220 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6221                                   HOST_WIDE_INT offset)
6222 {
6223   /* When using per-function literal pools, we must ensure that any code
6224      section is aligned to the minimal instruction length, lest we get
6225      errors from the assembler re "unaligned instructions".  */
6226   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6227     ASM_OUTPUT_ALIGN (f, 2);
6228 }
6229
6230 /* Costs.  */
6231
6232 /* Helper function for rtx cost calculation.  Strip a shift expression
6233    from X.  Returns the inner operand if successful, or the original
6234    expression on failure.  */
6235 static rtx
6236 aarch64_strip_shift (rtx x)
6237 {
6238   rtx op = x;
6239
6240   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6241      we can convert both to ROR during final output.  */
6242   if ((GET_CODE (op) == ASHIFT
6243        || GET_CODE (op) == ASHIFTRT
6244        || GET_CODE (op) == LSHIFTRT
6245        || GET_CODE (op) == ROTATERT
6246        || GET_CODE (op) == ROTATE)
6247       && CONST_INT_P (XEXP (op, 1)))
6248     return XEXP (op, 0);
6249
6250   if (GET_CODE (op) == MULT
6251       && CONST_INT_P (XEXP (op, 1))
6252       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6253     return XEXP (op, 0);
6254
6255   return x;
6256 }
6257
6258 /* Helper function for rtx cost calculation.  Strip an extend
6259    expression from X.  Returns the inner operand if successful, or the
6260    original expression on failure.  We deal with a number of possible
6261    canonicalization variations here. If STRIP_SHIFT is true, then
6262    we can strip off a shift also.  */
6263 static rtx
6264 aarch64_strip_extend (rtx x, bool strip_shift)
6265 {
6266   scalar_int_mode mode;
6267   rtx op = x;
6268
6269   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6270     return op;
6271
6272   /* Zero and sign extraction of a widened value.  */
6273   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6274       && XEXP (op, 2) == const0_rtx
6275       && GET_CODE (XEXP (op, 0)) == MULT
6276       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6277                                          XEXP (op, 1)))
6278     return XEXP (XEXP (op, 0), 0);
6279
6280   /* It can also be represented (for zero-extend) as an AND with an
6281      immediate.  */
6282   if (GET_CODE (op) == AND
6283       && GET_CODE (XEXP (op, 0)) == MULT
6284       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6285       && CONST_INT_P (XEXP (op, 1))
6286       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6287                            INTVAL (XEXP (op, 1))) != 0)
6288     return XEXP (XEXP (op, 0), 0);
6289
6290   /* Now handle extended register, as this may also have an optional
6291      left shift by 1..4.  */
6292   if (strip_shift
6293       && GET_CODE (op) == ASHIFT
6294       && CONST_INT_P (XEXP (op, 1))
6295       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6296     op = XEXP (op, 0);
6297
6298   if (GET_CODE (op) == ZERO_EXTEND
6299       || GET_CODE (op) == SIGN_EXTEND)
6300     op = XEXP (op, 0);
6301
6302   if (op != x)
6303     return op;
6304
6305   return x;
6306 }
6307
6308 /* Return true iff CODE is a shift supported in combination
6309    with arithmetic instructions.  */
6310
6311 static bool
6312 aarch64_shift_p (enum rtx_code code)
6313 {
6314   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6315 }
6316
6317
6318 /* Return true iff X is a cheap shift without a sign extend. */
6319
6320 static bool
6321 aarch64_cheap_mult_shift_p (rtx x)
6322 {
6323   rtx op0, op1;
6324
6325   op0 = XEXP (x, 0);
6326   op1 = XEXP (x, 1);
6327
6328   if (!(aarch64_tune_params.extra_tuning_flags
6329                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6330     return false;
6331
6332   if (GET_CODE (op0) == SIGN_EXTEND)
6333     return false;
6334
6335   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6336       && UINTVAL (op1) <= 4)
6337     return true;
6338
6339   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6340     return false;
6341
6342   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6343
6344   if (l2 > 0 && l2 <= 4)
6345     return true;
6346
6347   return false;
6348 }
6349
6350 /* Helper function for rtx cost calculation.  Calculate the cost of
6351    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6352    Return the calculated cost of the expression, recursing manually in to
6353    operands where needed.  */
6354
6355 static int
6356 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6357 {
6358   rtx op0, op1;
6359   const struct cpu_cost_table *extra_cost
6360     = aarch64_tune_params.insn_extra_cost;
6361   int cost = 0;
6362   bool compound_p = (outer == PLUS || outer == MINUS);
6363   machine_mode mode = GET_MODE (x);
6364
6365   gcc_checking_assert (code == MULT);
6366
6367   op0 = XEXP (x, 0);
6368   op1 = XEXP (x, 1);
6369
6370   if (VECTOR_MODE_P (mode))
6371     mode = GET_MODE_INNER (mode);
6372
6373   /* Integer multiply/fma.  */
6374   if (GET_MODE_CLASS (mode) == MODE_INT)
6375     {
6376       /* The multiply will be canonicalized as a shift, cost it as such.  */
6377       if (aarch64_shift_p (GET_CODE (x))
6378           || (CONST_INT_P (op1)
6379               && exact_log2 (INTVAL (op1)) > 0))
6380         {
6381           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6382                            || GET_CODE (op0) == SIGN_EXTEND;
6383           if (speed)
6384             {
6385               if (compound_p)
6386                 {
6387                   /* If the shift is considered cheap,
6388                      then don't add any cost. */
6389                   if (aarch64_cheap_mult_shift_p (x))
6390                     ;
6391                   else if (REG_P (op1))
6392                     /* ARITH + shift-by-register.  */
6393                     cost += extra_cost->alu.arith_shift_reg;
6394                   else if (is_extend)
6395                     /* ARITH + extended register.  We don't have a cost field
6396                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6397                     cost += extra_cost->alu.extend_arith;
6398                   else
6399                     /* ARITH + shift-by-immediate.  */
6400                     cost += extra_cost->alu.arith_shift;
6401                 }
6402               else
6403                 /* LSL (immediate).  */
6404                 cost += extra_cost->alu.shift;
6405
6406             }
6407           /* Strip extends as we will have costed them in the case above.  */
6408           if (is_extend)
6409             op0 = aarch64_strip_extend (op0, true);
6410
6411           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6412
6413           return cost;
6414         }
6415
6416       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6417          compound and let the below cases handle it.  After all, MNEG is a
6418          special-case alias of MSUB.  */
6419       if (GET_CODE (op0) == NEG)
6420         {
6421           op0 = XEXP (op0, 0);
6422           compound_p = true;
6423         }
6424
6425       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6426       if ((GET_CODE (op0) == ZERO_EXTEND
6427            && GET_CODE (op1) == ZERO_EXTEND)
6428           || (GET_CODE (op0) == SIGN_EXTEND
6429               && GET_CODE (op1) == SIGN_EXTEND))
6430         {
6431           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6432           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6433
6434           if (speed)
6435             {
6436               if (compound_p)
6437                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6438                 cost += extra_cost->mult[0].extend_add;
6439               else
6440                 /* MUL/SMULL/UMULL.  */
6441                 cost += extra_cost->mult[0].extend;
6442             }
6443
6444           return cost;
6445         }
6446
6447       /* This is either an integer multiply or a MADD.  In both cases
6448          we want to recurse and cost the operands.  */
6449       cost += rtx_cost (op0, mode, MULT, 0, speed);
6450       cost += rtx_cost (op1, mode, MULT, 1, speed);
6451
6452       if (speed)
6453         {
6454           if (compound_p)
6455             /* MADD/MSUB.  */
6456             cost += extra_cost->mult[mode == DImode].add;
6457           else
6458             /* MUL.  */
6459             cost += extra_cost->mult[mode == DImode].simple;
6460         }
6461
6462       return cost;
6463     }
6464   else
6465     {
6466       if (speed)
6467         {
6468           /* Floating-point FMA/FMUL can also support negations of the
6469              operands, unless the rounding mode is upward or downward in
6470              which case FNMUL is different than FMUL with operand negation.  */
6471           bool neg0 = GET_CODE (op0) == NEG;
6472           bool neg1 = GET_CODE (op1) == NEG;
6473           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6474             {
6475               if (neg0)
6476                 op0 = XEXP (op0, 0);
6477               if (neg1)
6478                 op1 = XEXP (op1, 0);
6479             }
6480
6481           if (compound_p)
6482             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6483             cost += extra_cost->fp[mode == DFmode].fma;
6484           else
6485             /* FMUL/FNMUL.  */
6486             cost += extra_cost->fp[mode == DFmode].mult;
6487         }
6488
6489       cost += rtx_cost (op0, mode, MULT, 0, speed);
6490       cost += rtx_cost (op1, mode, MULT, 1, speed);
6491       return cost;
6492     }
6493 }
6494
6495 static int
6496 aarch64_address_cost (rtx x,
6497                       machine_mode mode,
6498                       addr_space_t as ATTRIBUTE_UNUSED,
6499                       bool speed)
6500 {
6501   enum rtx_code c = GET_CODE (x);
6502   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6503   struct aarch64_address_info info;
6504   int cost = 0;
6505   info.shift = 0;
6506
6507   if (!aarch64_classify_address (&info, x, mode, c, false))
6508     {
6509       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6510         {
6511           /* This is a CONST or SYMBOL ref which will be split
6512              in a different way depending on the code model in use.
6513              Cost it through the generic infrastructure.  */
6514           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6515           /* Divide through by the cost of one instruction to
6516              bring it to the same units as the address costs.  */
6517           cost_symbol_ref /= COSTS_N_INSNS (1);
6518           /* The cost is then the cost of preparing the address,
6519              followed by an immediate (possibly 0) offset.  */
6520           return cost_symbol_ref + addr_cost->imm_offset;
6521         }
6522       else
6523         {
6524           /* This is most likely a jump table from a case
6525              statement.  */
6526           return addr_cost->register_offset;
6527         }
6528     }
6529
6530   switch (info.type)
6531     {
6532       case ADDRESS_LO_SUM:
6533       case ADDRESS_SYMBOLIC:
6534       case ADDRESS_REG_IMM:
6535         cost += addr_cost->imm_offset;
6536         break;
6537
6538       case ADDRESS_REG_WB:
6539         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6540           cost += addr_cost->pre_modify;
6541         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6542           cost += addr_cost->post_modify;
6543         else
6544           gcc_unreachable ();
6545
6546         break;
6547
6548       case ADDRESS_REG_REG:
6549         cost += addr_cost->register_offset;
6550         break;
6551
6552       case ADDRESS_REG_SXTW:
6553         cost += addr_cost->register_sextend;
6554         break;
6555
6556       case ADDRESS_REG_UXTW:
6557         cost += addr_cost->register_zextend;
6558         break;
6559
6560       default:
6561         gcc_unreachable ();
6562     }
6563
6564
6565   if (info.shift > 0)
6566     {
6567       /* For the sake of calculating the cost of the shifted register
6568          component, we can treat same sized modes in the same way.  */
6569       switch (GET_MODE_BITSIZE (mode))
6570         {
6571           case 16:
6572             cost += addr_cost->addr_scale_costs.hi;
6573             break;
6574
6575           case 32:
6576             cost += addr_cost->addr_scale_costs.si;
6577             break;
6578
6579           case 64:
6580             cost += addr_cost->addr_scale_costs.di;
6581             break;
6582
6583           /* We can't tell, or this is a 128-bit vector.  */
6584           default:
6585             cost += addr_cost->addr_scale_costs.ti;
6586             break;
6587         }
6588     }
6589
6590   return cost;
6591 }
6592
6593 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6594    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6595    to be taken.  */
6596
6597 int
6598 aarch64_branch_cost (bool speed_p, bool predictable_p)
6599 {
6600   /* When optimizing for speed, use the cost of unpredictable branches.  */
6601   const struct cpu_branch_cost *branch_costs =
6602     aarch64_tune_params.branch_costs;
6603
6604   if (!speed_p || predictable_p)
6605     return branch_costs->predictable;
6606   else
6607     return branch_costs->unpredictable;
6608 }
6609
6610 /* Return true if the RTX X in mode MODE is a zero or sign extract
6611    usable in an ADD or SUB (extended register) instruction.  */
6612 static bool
6613 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6614 {
6615   /* Catch add with a sign extract.
6616      This is add_<optab><mode>_multp2.  */
6617   if (GET_CODE (x) == SIGN_EXTRACT
6618       || GET_CODE (x) == ZERO_EXTRACT)
6619     {
6620       rtx op0 = XEXP (x, 0);
6621       rtx op1 = XEXP (x, 1);
6622       rtx op2 = XEXP (x, 2);
6623
6624       if (GET_CODE (op0) == MULT
6625           && CONST_INT_P (op1)
6626           && op2 == const0_rtx
6627           && CONST_INT_P (XEXP (op0, 1))
6628           && aarch64_is_extend_from_extract (mode,
6629                                              XEXP (op0, 1),
6630                                              op1))
6631         {
6632           return true;
6633         }
6634     }
6635   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6636      No shift.  */
6637   else if (GET_CODE (x) == SIGN_EXTEND
6638            || GET_CODE (x) == ZERO_EXTEND)
6639     return REG_P (XEXP (x, 0));
6640
6641   return false;
6642 }
6643
6644 static bool
6645 aarch64_frint_unspec_p (unsigned int u)
6646 {
6647   switch (u)
6648     {
6649       case UNSPEC_FRINTZ:
6650       case UNSPEC_FRINTP:
6651       case UNSPEC_FRINTM:
6652       case UNSPEC_FRINTA:
6653       case UNSPEC_FRINTN:
6654       case UNSPEC_FRINTX:
6655       case UNSPEC_FRINTI:
6656         return true;
6657
6658       default:
6659         return false;
6660     }
6661 }
6662
6663 /* Return true iff X is an rtx that will match an extr instruction
6664    i.e. as described in the *extr<mode>5_insn family of patterns.
6665    OP0 and OP1 will be set to the operands of the shifts involved
6666    on success and will be NULL_RTX otherwise.  */
6667
6668 static bool
6669 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6670 {
6671   rtx op0, op1;
6672   scalar_int_mode mode;
6673   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6674     return false;
6675
6676   *res_op0 = NULL_RTX;
6677   *res_op1 = NULL_RTX;
6678
6679   if (GET_CODE (x) != IOR)
6680     return false;
6681
6682   op0 = XEXP (x, 0);
6683   op1 = XEXP (x, 1);
6684
6685   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6686       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6687     {
6688      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6689       if (GET_CODE (op1) == ASHIFT)
6690         std::swap (op0, op1);
6691
6692       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6693         return false;
6694
6695       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6696       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6697
6698       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6699           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6700         {
6701           *res_op0 = XEXP (op0, 0);
6702           *res_op1 = XEXP (op1, 0);
6703           return true;
6704         }
6705     }
6706
6707   return false;
6708 }
6709
6710 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6711    storing it in *COST.  Result is true if the total cost of the operation
6712    has now been calculated.  */
6713 static bool
6714 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6715 {
6716   rtx inner;
6717   rtx comparator;
6718   enum rtx_code cmpcode;
6719
6720   if (COMPARISON_P (op0))
6721     {
6722       inner = XEXP (op0, 0);
6723       comparator = XEXP (op0, 1);
6724       cmpcode = GET_CODE (op0);
6725     }
6726   else
6727     {
6728       inner = op0;
6729       comparator = const0_rtx;
6730       cmpcode = NE;
6731     }
6732
6733   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6734     {
6735       /* Conditional branch.  */
6736       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6737         return true;
6738       else
6739         {
6740           if (cmpcode == NE || cmpcode == EQ)
6741             {
6742               if (comparator == const0_rtx)
6743                 {
6744                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6745                   if (GET_CODE (inner) == ZERO_EXTRACT)
6746                     /* TBZ/TBNZ.  */
6747                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6748                                        ZERO_EXTRACT, 0, speed);
6749                   else
6750                     /* CBZ/CBNZ.  */
6751                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6752
6753                 return true;
6754               }
6755             }
6756           else if (cmpcode == LT || cmpcode == GE)
6757             {
6758               /* TBZ/TBNZ.  */
6759               if (comparator == const0_rtx)
6760                 return true;
6761             }
6762         }
6763     }
6764   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6765     {
6766       /* CCMP.  */
6767       if (GET_CODE (op1) == COMPARE)
6768         {
6769           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6770           if (XEXP (op1, 1) == const0_rtx)
6771             *cost += 1;
6772           if (speed)
6773             {
6774               machine_mode mode = GET_MODE (XEXP (op1, 0));
6775               const struct cpu_cost_table *extra_cost
6776                 = aarch64_tune_params.insn_extra_cost;
6777
6778               if (GET_MODE_CLASS (mode) == MODE_INT)
6779                 *cost += extra_cost->alu.arith;
6780               else
6781                 *cost += extra_cost->fp[mode == DFmode].compare;
6782             }
6783           return true;
6784         }
6785
6786       /* It's a conditional operation based on the status flags,
6787          so it must be some flavor of CSEL.  */
6788
6789       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6790       if (GET_CODE (op1) == NEG
6791           || GET_CODE (op1) == NOT
6792           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6793         op1 = XEXP (op1, 0);
6794       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6795         {
6796           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6797           op1 = XEXP (op1, 0);
6798           op2 = XEXP (op2, 0);
6799         }
6800
6801       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6802       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6803       return true;
6804     }
6805
6806   /* We don't know what this is, cost all operands.  */
6807   return false;
6808 }
6809
6810 /* Check whether X is a bitfield operation of the form shift + extend that
6811    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6812    operand to which the bitfield operation is applied.  Otherwise return
6813    NULL_RTX.  */
6814
6815 static rtx
6816 aarch64_extend_bitfield_pattern_p (rtx x)
6817 {
6818   rtx_code outer_code = GET_CODE (x);
6819   machine_mode outer_mode = GET_MODE (x);
6820
6821   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6822       && outer_mode != SImode && outer_mode != DImode)
6823     return NULL_RTX;
6824
6825   rtx inner = XEXP (x, 0);
6826   rtx_code inner_code = GET_CODE (inner);
6827   machine_mode inner_mode = GET_MODE (inner);
6828   rtx op = NULL_RTX;
6829
6830   switch (inner_code)
6831     {
6832       case ASHIFT:
6833         if (CONST_INT_P (XEXP (inner, 1))
6834             && (inner_mode == QImode || inner_mode == HImode))
6835           op = XEXP (inner, 0);
6836         break;
6837       case LSHIFTRT:
6838         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6839             && (inner_mode == QImode || inner_mode == HImode))
6840           op = XEXP (inner, 0);
6841         break;
6842       case ASHIFTRT:
6843         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6844             && (inner_mode == QImode || inner_mode == HImode))
6845           op = XEXP (inner, 0);
6846         break;
6847       default:
6848         break;
6849     }
6850
6851   return op;
6852 }
6853
6854 /* Return true if the mask and a shift amount from an RTX of the form
6855    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6856    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6857
6858 bool
6859 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6860                                     rtx shft_amnt)
6861 {
6862   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6863          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6864          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6865          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6866 }
6867
6868 /* Calculate the cost of calculating X, storing it in *COST.  Result
6869    is true if the total cost of the operation has now been calculated.  */
6870 static bool
6871 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6872                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6873 {
6874   rtx op0, op1, op2;
6875   const struct cpu_cost_table *extra_cost
6876     = aarch64_tune_params.insn_extra_cost;
6877   int code = GET_CODE (x);
6878   scalar_int_mode int_mode;
6879
6880   /* By default, assume that everything has equivalent cost to the
6881      cheapest instruction.  Any additional costs are applied as a delta
6882      above this default.  */
6883   *cost = COSTS_N_INSNS (1);
6884
6885   switch (code)
6886     {
6887     case SET:
6888       /* The cost depends entirely on the operands to SET.  */
6889       *cost = 0;
6890       op0 = SET_DEST (x);
6891       op1 = SET_SRC (x);
6892
6893       switch (GET_CODE (op0))
6894         {
6895         case MEM:
6896           if (speed)
6897             {
6898               rtx address = XEXP (op0, 0);
6899               if (VECTOR_MODE_P (mode))
6900                 *cost += extra_cost->ldst.storev;
6901               else if (GET_MODE_CLASS (mode) == MODE_INT)
6902                 *cost += extra_cost->ldst.store;
6903               else if (mode == SFmode)
6904                 *cost += extra_cost->ldst.storef;
6905               else if (mode == DFmode)
6906                 *cost += extra_cost->ldst.stored;
6907
6908               *cost +=
6909                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6910                                                      0, speed));
6911             }
6912
6913           *cost += rtx_cost (op1, mode, SET, 1, speed);
6914           return true;
6915
6916         case SUBREG:
6917           if (! REG_P (SUBREG_REG (op0)))
6918             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6919
6920           /* Fall through.  */
6921         case REG:
6922           /* The cost is one per vector-register copied.  */
6923           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6924             {
6925               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6926                               / GET_MODE_SIZE (V4SImode);
6927               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6928             }
6929           /* const0_rtx is in general free, but we will use an
6930              instruction to set a register to 0.  */
6931           else if (REG_P (op1) || op1 == const0_rtx)
6932             {
6933               /* The cost is 1 per register copied.  */
6934               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6935                               / UNITS_PER_WORD;
6936               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6937             }
6938           else
6939             /* Cost is just the cost of the RHS of the set.  */
6940             *cost += rtx_cost (op1, mode, SET, 1, speed);
6941           return true;
6942
6943         case ZERO_EXTRACT:
6944         case SIGN_EXTRACT:
6945           /* Bit-field insertion.  Strip any redundant widening of
6946              the RHS to meet the width of the target.  */
6947           if (GET_CODE (op1) == SUBREG)
6948             op1 = SUBREG_REG (op1);
6949           if ((GET_CODE (op1) == ZERO_EXTEND
6950                || GET_CODE (op1) == SIGN_EXTEND)
6951               && CONST_INT_P (XEXP (op0, 1))
6952               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6953               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6954             op1 = XEXP (op1, 0);
6955
6956           if (CONST_INT_P (op1))
6957             {
6958               /* MOV immediate is assumed to always be cheap.  */
6959               *cost = COSTS_N_INSNS (1);
6960             }
6961           else
6962             {
6963               /* BFM.  */
6964               if (speed)
6965                 *cost += extra_cost->alu.bfi;
6966               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6967             }
6968
6969           return true;
6970
6971         default:
6972           /* We can't make sense of this, assume default cost.  */
6973           *cost = COSTS_N_INSNS (1);
6974           return false;
6975         }
6976       return false;
6977
6978     case CONST_INT:
6979       /* If an instruction can incorporate a constant within the
6980          instruction, the instruction's expression avoids calling
6981          rtx_cost() on the constant.  If rtx_cost() is called on a
6982          constant, then it is usually because the constant must be
6983          moved into a register by one or more instructions.
6984
6985          The exception is constant 0, which can be expressed
6986          as XZR/WZR and is therefore free.  The exception to this is
6987          if we have (set (reg) (const0_rtx)) in which case we must cost
6988          the move.  However, we can catch that when we cost the SET, so
6989          we don't need to consider that here.  */
6990       if (x == const0_rtx)
6991         *cost = 0;
6992       else
6993         {
6994           /* To an approximation, building any other constant is
6995              proportionally expensive to the number of instructions
6996              required to build that constant.  This is true whether we
6997              are compiling for SPEED or otherwise.  */
6998           if (!is_a <scalar_int_mode> (mode, &int_mode))
6999             int_mode = word_mode;
7000           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7001                                  (NULL_RTX, x, false, int_mode));
7002         }
7003       return true;
7004
7005     case CONST_DOUBLE:
7006
7007       /* First determine number of instructions to do the move
7008           as an integer constant.  */
7009       if (!aarch64_float_const_representable_p (x)
7010            && !aarch64_can_const_movi_rtx_p (x, mode)
7011            && aarch64_float_const_rtx_p (x))
7012         {
7013           unsigned HOST_WIDE_INT ival;
7014           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7015           gcc_assert (succeed);
7016
7017           scalar_int_mode imode = (mode == HFmode
7018                                    ? SImode
7019                                    : int_mode_for_mode (mode).require ());
7020           int ncost = aarch64_internal_mov_immediate
7021                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7022           *cost += COSTS_N_INSNS (ncost);
7023           return true;
7024         }
7025
7026       if (speed)
7027         {
7028           /* mov[df,sf]_aarch64.  */
7029           if (aarch64_float_const_representable_p (x))
7030             /* FMOV (scalar immediate).  */
7031             *cost += extra_cost->fp[mode == DFmode].fpconst;
7032           else if (!aarch64_float_const_zero_rtx_p (x))
7033             {
7034               /* This will be a load from memory.  */
7035               if (mode == DFmode)
7036                 *cost += extra_cost->ldst.loadd;
7037               else
7038                 *cost += extra_cost->ldst.loadf;
7039             }
7040           else
7041             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7042                or MOV v0.s[0], wzr - neither of which are modeled by the
7043                cost tables.  Just use the default cost.  */
7044             {
7045             }
7046         }
7047
7048       return true;
7049
7050     case MEM:
7051       if (speed)
7052         {
7053           /* For loads we want the base cost of a load, plus an
7054              approximation for the additional cost of the addressing
7055              mode.  */
7056           rtx address = XEXP (x, 0);
7057           if (VECTOR_MODE_P (mode))
7058             *cost += extra_cost->ldst.loadv;
7059           else if (GET_MODE_CLASS (mode) == MODE_INT)
7060             *cost += extra_cost->ldst.load;
7061           else if (mode == SFmode)
7062             *cost += extra_cost->ldst.loadf;
7063           else if (mode == DFmode)
7064             *cost += extra_cost->ldst.loadd;
7065
7066           *cost +=
7067                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7068                                                      0, speed));
7069         }
7070
7071       return true;
7072
7073     case NEG:
7074       op0 = XEXP (x, 0);
7075
7076       if (VECTOR_MODE_P (mode))
7077         {
7078           if (speed)
7079             {
7080               /* FNEG.  */
7081               *cost += extra_cost->vect.alu;
7082             }
7083           return false;
7084         }
7085
7086       if (GET_MODE_CLASS (mode) == MODE_INT)
7087         {
7088           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7089               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7090             {
7091               /* CSETM.  */
7092               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7093               return true;
7094             }
7095
7096           /* Cost this as SUB wzr, X.  */
7097           op0 = CONST0_RTX (mode);
7098           op1 = XEXP (x, 0);
7099           goto cost_minus;
7100         }
7101
7102       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7103         {
7104           /* Support (neg(fma...)) as a single instruction only if
7105              sign of zeros is unimportant.  This matches the decision
7106              making in aarch64.md.  */
7107           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7108             {
7109               /* FNMADD.  */
7110               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7111               return true;
7112             }
7113           if (GET_CODE (op0) == MULT)
7114             {
7115               /* FNMUL.  */
7116               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7117               return true;
7118             }
7119           if (speed)
7120             /* FNEG.  */
7121             *cost += extra_cost->fp[mode == DFmode].neg;
7122           return false;
7123         }
7124
7125       return false;
7126
7127     case CLRSB:
7128     case CLZ:
7129       if (speed)
7130         {
7131           if (VECTOR_MODE_P (mode))
7132             *cost += extra_cost->vect.alu;
7133           else
7134             *cost += extra_cost->alu.clz;
7135         }
7136
7137       return false;
7138
7139     case COMPARE:
7140       op0 = XEXP (x, 0);
7141       op1 = XEXP (x, 1);
7142
7143       if (op1 == const0_rtx
7144           && GET_CODE (op0) == AND)
7145         {
7146           x = op0;
7147           mode = GET_MODE (op0);
7148           goto cost_logic;
7149         }
7150
7151       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7152         {
7153           /* TODO: A write to the CC flags possibly costs extra, this
7154              needs encoding in the cost tables.  */
7155
7156           mode = GET_MODE (op0);
7157           /* ANDS.  */
7158           if (GET_CODE (op0) == AND)
7159             {
7160               x = op0;
7161               goto cost_logic;
7162             }
7163
7164           if (GET_CODE (op0) == PLUS)
7165             {
7166               /* ADDS (and CMN alias).  */
7167               x = op0;
7168               goto cost_plus;
7169             }
7170
7171           if (GET_CODE (op0) == MINUS)
7172             {
7173               /* SUBS.  */
7174               x = op0;
7175               goto cost_minus;
7176             }
7177
7178           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7179               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7180               && CONST_INT_P (XEXP (op0, 2)))
7181             {
7182               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7183                  Handle it here directly rather than going to cost_logic
7184                  since we know the immediate generated for the TST is valid
7185                  so we can avoid creating an intermediate rtx for it only
7186                  for costing purposes.  */
7187               if (speed)
7188                 *cost += extra_cost->alu.logical;
7189
7190               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7191                                  ZERO_EXTRACT, 0, speed);
7192               return true;
7193             }
7194
7195           if (GET_CODE (op1) == NEG)
7196             {
7197               /* CMN.  */
7198               if (speed)
7199                 *cost += extra_cost->alu.arith;
7200
7201               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7202               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7203               return true;
7204             }
7205
7206           /* CMP.
7207
7208              Compare can freely swap the order of operands, and
7209              canonicalization puts the more complex operation first.
7210              But the integer MINUS logic expects the shift/extend
7211              operation in op1.  */
7212           if (! (REG_P (op0)
7213                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7214           {
7215             op0 = XEXP (x, 1);
7216             op1 = XEXP (x, 0);
7217           }
7218           goto cost_minus;
7219         }
7220
7221       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7222         {
7223           /* FCMP.  */
7224           if (speed)
7225             *cost += extra_cost->fp[mode == DFmode].compare;
7226
7227           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7228             {
7229               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7230               /* FCMP supports constant 0.0 for no extra cost. */
7231               return true;
7232             }
7233           return false;
7234         }
7235
7236       if (VECTOR_MODE_P (mode))
7237         {
7238           /* Vector compare.  */
7239           if (speed)
7240             *cost += extra_cost->vect.alu;
7241
7242           if (aarch64_float_const_zero_rtx_p (op1))
7243             {
7244               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7245                  cost.  */
7246               return true;
7247             }
7248           return false;
7249         }
7250       return false;
7251
7252     case MINUS:
7253       {
7254         op0 = XEXP (x, 0);
7255         op1 = XEXP (x, 1);
7256
7257 cost_minus:
7258         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7259
7260         /* Detect valid immediates.  */
7261         if ((GET_MODE_CLASS (mode) == MODE_INT
7262              || (GET_MODE_CLASS (mode) == MODE_CC
7263                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7264             && CONST_INT_P (op1)
7265             && aarch64_uimm12_shift (INTVAL (op1)))
7266           {
7267             if (speed)
7268               /* SUB(S) (immediate).  */
7269               *cost += extra_cost->alu.arith;
7270             return true;
7271           }
7272
7273         /* Look for SUB (extended register).  */
7274         if (is_a <scalar_int_mode> (mode, &int_mode)
7275             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7276           {
7277             if (speed)
7278               *cost += extra_cost->alu.extend_arith;
7279
7280             op1 = aarch64_strip_extend (op1, true);
7281             *cost += rtx_cost (op1, VOIDmode,
7282                                (enum rtx_code) GET_CODE (op1), 0, speed);
7283             return true;
7284           }
7285
7286         rtx new_op1 = aarch64_strip_extend (op1, false);
7287
7288         /* Cost this as an FMA-alike operation.  */
7289         if ((GET_CODE (new_op1) == MULT
7290              || aarch64_shift_p (GET_CODE (new_op1)))
7291             && code != COMPARE)
7292           {
7293             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7294                                             (enum rtx_code) code,
7295                                             speed);
7296             return true;
7297           }
7298
7299         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7300
7301         if (speed)
7302           {
7303             if (VECTOR_MODE_P (mode))
7304               {
7305                 /* Vector SUB.  */
7306                 *cost += extra_cost->vect.alu;
7307               }
7308             else if (GET_MODE_CLASS (mode) == MODE_INT)
7309               {
7310                 /* SUB(S).  */
7311                 *cost += extra_cost->alu.arith;
7312               }
7313             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7314               {
7315                 /* FSUB.  */
7316                 *cost += extra_cost->fp[mode == DFmode].addsub;
7317               }
7318           }
7319         return true;
7320       }
7321
7322     case PLUS:
7323       {
7324         rtx new_op0;
7325
7326         op0 = XEXP (x, 0);
7327         op1 = XEXP (x, 1);
7328
7329 cost_plus:
7330         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7331             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7332           {
7333             /* CSINC.  */
7334             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7335             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7336             return true;
7337           }
7338
7339         if (GET_MODE_CLASS (mode) == MODE_INT
7340             && CONST_INT_P (op1)
7341             && aarch64_uimm12_shift (INTVAL (op1)))
7342           {
7343             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7344
7345             if (speed)
7346               /* ADD (immediate).  */
7347               *cost += extra_cost->alu.arith;
7348             return true;
7349           }
7350
7351         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7352
7353         /* Look for ADD (extended register).  */
7354         if (is_a <scalar_int_mode> (mode, &int_mode)
7355             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7356           {
7357             if (speed)
7358               *cost += extra_cost->alu.extend_arith;
7359
7360             op0 = aarch64_strip_extend (op0, true);
7361             *cost += rtx_cost (op0, VOIDmode,
7362                                (enum rtx_code) GET_CODE (op0), 0, speed);
7363             return true;
7364           }
7365
7366         /* Strip any extend, leave shifts behind as we will
7367            cost them through mult_cost.  */
7368         new_op0 = aarch64_strip_extend (op0, false);
7369
7370         if (GET_CODE (new_op0) == MULT
7371             || aarch64_shift_p (GET_CODE (new_op0)))
7372           {
7373             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7374                                             speed);
7375             return true;
7376           }
7377
7378         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7379
7380         if (speed)
7381           {
7382             if (VECTOR_MODE_P (mode))
7383               {
7384                 /* Vector ADD.  */
7385                 *cost += extra_cost->vect.alu;
7386               }
7387             else if (GET_MODE_CLASS (mode) == MODE_INT)
7388               {
7389                 /* ADD.  */
7390                 *cost += extra_cost->alu.arith;
7391               }
7392             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7393               {
7394                 /* FADD.  */
7395                 *cost += extra_cost->fp[mode == DFmode].addsub;
7396               }
7397           }
7398         return true;
7399       }
7400
7401     case BSWAP:
7402       *cost = COSTS_N_INSNS (1);
7403
7404       if (speed)
7405         {
7406           if (VECTOR_MODE_P (mode))
7407             *cost += extra_cost->vect.alu;
7408           else
7409             *cost += extra_cost->alu.rev;
7410         }
7411       return false;
7412
7413     case IOR:
7414       if (aarch_rev16_p (x))
7415         {
7416           *cost = COSTS_N_INSNS (1);
7417
7418           if (speed)
7419             {
7420               if (VECTOR_MODE_P (mode))
7421                 *cost += extra_cost->vect.alu;
7422               else
7423                 *cost += extra_cost->alu.rev;
7424             }
7425           return true;
7426         }
7427
7428       if (aarch64_extr_rtx_p (x, &op0, &op1))
7429         {
7430           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7431           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7432           if (speed)
7433             *cost += extra_cost->alu.shift;
7434
7435           return true;
7436         }
7437     /* Fall through.  */
7438     case XOR:
7439     case AND:
7440     cost_logic:
7441       op0 = XEXP (x, 0);
7442       op1 = XEXP (x, 1);
7443
7444       if (VECTOR_MODE_P (mode))
7445         {
7446           if (speed)
7447             *cost += extra_cost->vect.alu;
7448           return true;
7449         }
7450
7451       if (code == AND
7452           && GET_CODE (op0) == MULT
7453           && CONST_INT_P (XEXP (op0, 1))
7454           && CONST_INT_P (op1)
7455           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7456                                INTVAL (op1)) != 0)
7457         {
7458           /* This is a UBFM/SBFM.  */
7459           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7460           if (speed)
7461             *cost += extra_cost->alu.bfx;
7462           return true;
7463         }
7464
7465       if (is_int_mode (mode, &int_mode))
7466         {
7467           if (CONST_INT_P (op1))
7468             {
7469               /* We have a mask + shift version of a UBFIZ
7470                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7471               if (GET_CODE (op0) == ASHIFT
7472                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7473                                                          XEXP (op0, 1)))
7474                 {
7475                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
7476                                      (enum rtx_code) code, 0, speed);
7477                   if (speed)
7478                     *cost += extra_cost->alu.bfx;
7479
7480                   return true;
7481                 }
7482               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7483                 {
7484                 /* We possibly get the immediate for free, this is not
7485                    modelled.  */
7486                   *cost += rtx_cost (op0, int_mode,
7487                                      (enum rtx_code) code, 0, speed);
7488                   if (speed)
7489                     *cost += extra_cost->alu.logical;
7490
7491                   return true;
7492                 }
7493             }
7494           else
7495             {
7496               rtx new_op0 = op0;
7497
7498               /* Handle ORN, EON, or BIC.  */
7499               if (GET_CODE (op0) == NOT)
7500                 op0 = XEXP (op0, 0);
7501
7502               new_op0 = aarch64_strip_shift (op0);
7503
7504               /* If we had a shift on op0 then this is a logical-shift-
7505                  by-register/immediate operation.  Otherwise, this is just
7506                  a logical operation.  */
7507               if (speed)
7508                 {
7509                   if (new_op0 != op0)
7510                     {
7511                       /* Shift by immediate.  */
7512                       if (CONST_INT_P (XEXP (op0, 1)))
7513                         *cost += extra_cost->alu.log_shift;
7514                       else
7515                         *cost += extra_cost->alu.log_shift_reg;
7516                     }
7517                   else
7518                     *cost += extra_cost->alu.logical;
7519                 }
7520
7521               /* In both cases we want to cost both operands.  */
7522               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7523                                  0, speed);
7524               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7525                                  1, speed);
7526
7527               return true;
7528             }
7529         }
7530       return false;
7531
7532     case NOT:
7533       x = XEXP (x, 0);
7534       op0 = aarch64_strip_shift (x);
7535
7536       if (VECTOR_MODE_P (mode))
7537         {
7538           /* Vector NOT.  */
7539           *cost += extra_cost->vect.alu;
7540           return false;
7541         }
7542
7543       /* MVN-shifted-reg.  */
7544       if (op0 != x)
7545         {
7546           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7547
7548           if (speed)
7549             *cost += extra_cost->alu.log_shift;
7550
7551           return true;
7552         }
7553       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7554          Handle the second form here taking care that 'a' in the above can
7555          be a shift.  */
7556       else if (GET_CODE (op0) == XOR)
7557         {
7558           rtx newop0 = XEXP (op0, 0);
7559           rtx newop1 = XEXP (op0, 1);
7560           rtx op0_stripped = aarch64_strip_shift (newop0);
7561
7562           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7563           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7564
7565           if (speed)
7566             {
7567               if (op0_stripped != newop0)
7568                 *cost += extra_cost->alu.log_shift;
7569               else
7570                 *cost += extra_cost->alu.logical;
7571             }
7572
7573           return true;
7574         }
7575       /* MVN.  */
7576       if (speed)
7577         *cost += extra_cost->alu.logical;
7578
7579       return false;
7580
7581     case ZERO_EXTEND:
7582
7583       op0 = XEXP (x, 0);
7584       /* If a value is written in SI mode, then zero extended to DI
7585          mode, the operation will in general be free as a write to
7586          a 'w' register implicitly zeroes the upper bits of an 'x'
7587          register.  However, if this is
7588
7589            (set (reg) (zero_extend (reg)))
7590
7591          we must cost the explicit register move.  */
7592       if (mode == DImode
7593           && GET_MODE (op0) == SImode
7594           && outer == SET)
7595         {
7596           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7597
7598         /* If OP_COST is non-zero, then the cost of the zero extend
7599            is effectively the cost of the inner operation.  Otherwise
7600            we have a MOV instruction and we take the cost from the MOV
7601            itself.  This is true independently of whether we are
7602            optimizing for space or time.  */
7603           if (op_cost)
7604             *cost = op_cost;
7605
7606           return true;
7607         }
7608       else if (MEM_P (op0))
7609         {
7610           /* All loads can zero extend to any size for free.  */
7611           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7612           return true;
7613         }
7614
7615       op0 = aarch64_extend_bitfield_pattern_p (x);
7616       if (op0)
7617         {
7618           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7619           if (speed)
7620             *cost += extra_cost->alu.bfx;
7621           return true;
7622         }
7623
7624       if (speed)
7625         {
7626           if (VECTOR_MODE_P (mode))
7627             {
7628               /* UMOV.  */
7629               *cost += extra_cost->vect.alu;
7630             }
7631           else
7632             {
7633               /* We generate an AND instead of UXTB/UXTH.  */
7634               *cost += extra_cost->alu.logical;
7635             }
7636         }
7637       return false;
7638
7639     case SIGN_EXTEND:
7640       if (MEM_P (XEXP (x, 0)))
7641         {
7642           /* LDRSH.  */
7643           if (speed)
7644             {
7645               rtx address = XEXP (XEXP (x, 0), 0);
7646               *cost += extra_cost->ldst.load_sign_extend;
7647
7648               *cost +=
7649                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7650                                                      0, speed));
7651             }
7652           return true;
7653         }
7654
7655       op0 = aarch64_extend_bitfield_pattern_p (x);
7656       if (op0)
7657         {
7658           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7659           if (speed)
7660             *cost += extra_cost->alu.bfx;
7661           return true;
7662         }
7663
7664       if (speed)
7665         {
7666           if (VECTOR_MODE_P (mode))
7667             *cost += extra_cost->vect.alu;
7668           else
7669             *cost += extra_cost->alu.extend;
7670         }
7671       return false;
7672
7673     case ASHIFT:
7674       op0 = XEXP (x, 0);
7675       op1 = XEXP (x, 1);
7676
7677       if (CONST_INT_P (op1))
7678         {
7679           if (speed)
7680             {
7681               if (VECTOR_MODE_P (mode))
7682                 {
7683                   /* Vector shift (immediate).  */
7684                   *cost += extra_cost->vect.alu;
7685                 }
7686               else
7687                 {
7688                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7689                      aliases.  */
7690                   *cost += extra_cost->alu.shift;
7691                 }
7692             }
7693
7694           /* We can incorporate zero/sign extend for free.  */
7695           if (GET_CODE (op0) == ZERO_EXTEND
7696               || GET_CODE (op0) == SIGN_EXTEND)
7697             op0 = XEXP (op0, 0);
7698
7699           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7700           return true;
7701         }
7702       else
7703         {
7704           if (VECTOR_MODE_P (mode))
7705             {
7706               if (speed)
7707                 /* Vector shift (register).  */
7708                 *cost += extra_cost->vect.alu;
7709             }
7710           else
7711             {
7712               if (speed)
7713                 /* LSLV.  */
7714                 *cost += extra_cost->alu.shift_reg;
7715
7716               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7717                   && CONST_INT_P (XEXP (op1, 1))
7718                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7719                 {
7720                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7721                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7722                      don't recurse into it.  */
7723                   return true;
7724                 }
7725             }
7726           return false;  /* All arguments need to be in registers.  */
7727         }
7728
7729     case ROTATE:
7730     case ROTATERT:
7731     case LSHIFTRT:
7732     case ASHIFTRT:
7733       op0 = XEXP (x, 0);
7734       op1 = XEXP (x, 1);
7735
7736       if (CONST_INT_P (op1))
7737         {
7738           /* ASR (immediate) and friends.  */
7739           if (speed)
7740             {
7741               if (VECTOR_MODE_P (mode))
7742                 *cost += extra_cost->vect.alu;
7743               else
7744                 *cost += extra_cost->alu.shift;
7745             }
7746
7747           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7748           return true;
7749         }
7750       else
7751         {
7752           if (VECTOR_MODE_P (mode))
7753             {
7754               if (speed)
7755                 /* Vector shift (register).  */
7756                 *cost += extra_cost->vect.alu;
7757             }
7758           else
7759             {
7760               if (speed)
7761                 /* ASR (register) and friends.  */
7762                 *cost += extra_cost->alu.shift_reg;
7763
7764               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7765                   && CONST_INT_P (XEXP (op1, 1))
7766                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7767                 {
7768                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7769                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7770                      don't recurse into it.  */
7771                   return true;
7772                 }
7773             }
7774           return false;  /* All arguments need to be in registers.  */
7775         }
7776
7777     case SYMBOL_REF:
7778
7779       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7780           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7781         {
7782           /* LDR.  */
7783           if (speed)
7784             *cost += extra_cost->ldst.load;
7785         }
7786       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7787                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7788         {
7789           /* ADRP, followed by ADD.  */
7790           *cost += COSTS_N_INSNS (1);
7791           if (speed)
7792             *cost += 2 * extra_cost->alu.arith;
7793         }
7794       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7795                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7796         {
7797           /* ADR.  */
7798           if (speed)
7799             *cost += extra_cost->alu.arith;
7800         }
7801
7802       if (flag_pic)
7803         {
7804           /* One extra load instruction, after accessing the GOT.  */
7805           *cost += COSTS_N_INSNS (1);
7806           if (speed)
7807             *cost += extra_cost->ldst.load;
7808         }
7809       return true;
7810
7811     case HIGH:
7812     case LO_SUM:
7813       /* ADRP/ADD (immediate).  */
7814       if (speed)
7815         *cost += extra_cost->alu.arith;
7816       return true;
7817
7818     case ZERO_EXTRACT:
7819     case SIGN_EXTRACT:
7820       /* UBFX/SBFX.  */
7821       if (speed)
7822         {
7823           if (VECTOR_MODE_P (mode))
7824             *cost += extra_cost->vect.alu;
7825           else
7826             *cost += extra_cost->alu.bfx;
7827         }
7828
7829       /* We can trust that the immediates used will be correct (there
7830          are no by-register forms), so we need only cost op0.  */
7831       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7832       return true;
7833
7834     case MULT:
7835       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7836       /* aarch64_rtx_mult_cost always handles recursion to its
7837          operands.  */
7838       return true;
7839
7840     case MOD:
7841     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7842        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7843        an unconditional negate.  This case should only ever be reached through
7844        the set_smod_pow2_cheap check in expmed.c.  */
7845       if (CONST_INT_P (XEXP (x, 1))
7846           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7847           && (mode == SImode || mode == DImode))
7848         {
7849           /* We expand to 4 instructions.  Reset the baseline.  */
7850           *cost = COSTS_N_INSNS (4);
7851
7852           if (speed)
7853             *cost += 2 * extra_cost->alu.logical
7854                      + 2 * extra_cost->alu.arith;
7855
7856           return true;
7857         }
7858
7859     /* Fall-through.  */
7860     case UMOD:
7861       if (speed)
7862         {
7863           /* Slighly prefer UMOD over SMOD.  */
7864           if (VECTOR_MODE_P (mode))
7865             *cost += extra_cost->vect.alu;
7866           else if (GET_MODE_CLASS (mode) == MODE_INT)
7867             *cost += (extra_cost->mult[mode == DImode].add
7868                       + extra_cost->mult[mode == DImode].idiv
7869                       + (code == MOD ? 1 : 0));
7870         }
7871       return false;  /* All arguments need to be in registers.  */
7872
7873     case DIV:
7874     case UDIV:
7875     case SQRT:
7876       if (speed)
7877         {
7878           if (VECTOR_MODE_P (mode))
7879             *cost += extra_cost->vect.alu;
7880           else if (GET_MODE_CLASS (mode) == MODE_INT)
7881             /* There is no integer SQRT, so only DIV and UDIV can get
7882                here.  */
7883             *cost += (extra_cost->mult[mode == DImode].idiv
7884                      /* Slighly prefer UDIV over SDIV.  */
7885                      + (code == DIV ? 1 : 0));
7886           else
7887             *cost += extra_cost->fp[mode == DFmode].div;
7888         }
7889       return false;  /* All arguments need to be in registers.  */
7890
7891     case IF_THEN_ELSE:
7892       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7893                                          XEXP (x, 2), cost, speed);
7894
7895     case EQ:
7896     case NE:
7897     case GT:
7898     case GTU:
7899     case LT:
7900     case LTU:
7901     case GE:
7902     case GEU:
7903     case LE:
7904     case LEU:
7905
7906       return false; /* All arguments must be in registers.  */
7907
7908     case FMA:
7909       op0 = XEXP (x, 0);
7910       op1 = XEXP (x, 1);
7911       op2 = XEXP (x, 2);
7912
7913       if (speed)
7914         {
7915           if (VECTOR_MODE_P (mode))
7916             *cost += extra_cost->vect.alu;
7917           else
7918             *cost += extra_cost->fp[mode == DFmode].fma;
7919         }
7920
7921       /* FMSUB, FNMADD, and FNMSUB are free.  */
7922       if (GET_CODE (op0) == NEG)
7923         op0 = XEXP (op0, 0);
7924
7925       if (GET_CODE (op2) == NEG)
7926         op2 = XEXP (op2, 0);
7927
7928       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7929          and the by-element operand as operand 0.  */
7930       if (GET_CODE (op1) == NEG)
7931         op1 = XEXP (op1, 0);
7932
7933       /* Catch vector-by-element operations.  The by-element operand can
7934          either be (vec_duplicate (vec_select (x))) or just
7935          (vec_select (x)), depending on whether we are multiplying by
7936          a vector or a scalar.
7937
7938          Canonicalization is not very good in these cases, FMA4 will put the
7939          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7940       if (GET_CODE (op0) == VEC_DUPLICATE)
7941         op0 = XEXP (op0, 0);
7942       else if (GET_CODE (op1) == VEC_DUPLICATE)
7943         op1 = XEXP (op1, 0);
7944
7945       if (GET_CODE (op0) == VEC_SELECT)
7946         op0 = XEXP (op0, 0);
7947       else if (GET_CODE (op1) == VEC_SELECT)
7948         op1 = XEXP (op1, 0);
7949
7950       /* If the remaining parameters are not registers,
7951          get the cost to put them into registers.  */
7952       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7953       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7954       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7955       return true;
7956
7957     case FLOAT:
7958     case UNSIGNED_FLOAT:
7959       if (speed)
7960         *cost += extra_cost->fp[mode == DFmode].fromint;
7961       return false;
7962
7963     case FLOAT_EXTEND:
7964       if (speed)
7965         {
7966           if (VECTOR_MODE_P (mode))
7967             {
7968               /*Vector truncate.  */
7969               *cost += extra_cost->vect.alu;
7970             }
7971           else
7972             *cost += extra_cost->fp[mode == DFmode].widen;
7973         }
7974       return false;
7975
7976     case FLOAT_TRUNCATE:
7977       if (speed)
7978         {
7979           if (VECTOR_MODE_P (mode))
7980             {
7981               /*Vector conversion.  */
7982               *cost += extra_cost->vect.alu;
7983             }
7984           else
7985             *cost += extra_cost->fp[mode == DFmode].narrow;
7986         }
7987       return false;
7988
7989     case FIX:
7990     case UNSIGNED_FIX:
7991       x = XEXP (x, 0);
7992       /* Strip the rounding part.  They will all be implemented
7993          by the fcvt* family of instructions anyway.  */
7994       if (GET_CODE (x) == UNSPEC)
7995         {
7996           unsigned int uns_code = XINT (x, 1);
7997
7998           if (uns_code == UNSPEC_FRINTA
7999               || uns_code == UNSPEC_FRINTM
8000               || uns_code == UNSPEC_FRINTN
8001               || uns_code == UNSPEC_FRINTP
8002               || uns_code == UNSPEC_FRINTZ)
8003             x = XVECEXP (x, 0, 0);
8004         }
8005
8006       if (speed)
8007         {
8008           if (VECTOR_MODE_P (mode))
8009             *cost += extra_cost->vect.alu;
8010           else
8011             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8012         }
8013
8014       /* We can combine fmul by a power of 2 followed by a fcvt into a single
8015          fixed-point fcvt.  */
8016       if (GET_CODE (x) == MULT
8017           && ((VECTOR_MODE_P (mode)
8018                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8019               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8020         {
8021           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8022                              0, speed);
8023           return true;
8024         }
8025
8026       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8027       return true;
8028
8029     case ABS:
8030       if (VECTOR_MODE_P (mode))
8031         {
8032           /* ABS (vector).  */
8033           if (speed)
8034             *cost += extra_cost->vect.alu;
8035         }
8036       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8037         {
8038           op0 = XEXP (x, 0);
8039
8040           /* FABD, which is analogous to FADD.  */
8041           if (GET_CODE (op0) == MINUS)
8042             {
8043               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8044               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8045               if (speed)
8046                 *cost += extra_cost->fp[mode == DFmode].addsub;
8047
8048               return true;
8049             }
8050           /* Simple FABS is analogous to FNEG.  */
8051           if (speed)
8052             *cost += extra_cost->fp[mode == DFmode].neg;
8053         }
8054       else
8055         {
8056           /* Integer ABS will either be split to
8057              two arithmetic instructions, or will be an ABS
8058              (scalar), which we don't model.  */
8059           *cost = COSTS_N_INSNS (2);
8060           if (speed)
8061             *cost += 2 * extra_cost->alu.arith;
8062         }
8063       return false;
8064
8065     case SMAX:
8066     case SMIN:
8067       if (speed)
8068         {
8069           if (VECTOR_MODE_P (mode))
8070             *cost += extra_cost->vect.alu;
8071           else
8072             {
8073               /* FMAXNM/FMINNM/FMAX/FMIN.
8074                  TODO: This may not be accurate for all implementations, but
8075                  we do not model this in the cost tables.  */
8076               *cost += extra_cost->fp[mode == DFmode].addsub;
8077             }
8078         }
8079       return false;
8080
8081     case UNSPEC:
8082       /* The floating point round to integer frint* instructions.  */
8083       if (aarch64_frint_unspec_p (XINT (x, 1)))
8084         {
8085           if (speed)
8086             *cost += extra_cost->fp[mode == DFmode].roundint;
8087
8088           return false;
8089         }
8090
8091       if (XINT (x, 1) == UNSPEC_RBIT)
8092         {
8093           if (speed)
8094             *cost += extra_cost->alu.rev;
8095
8096           return false;
8097         }
8098       break;
8099
8100     case TRUNCATE:
8101
8102       /* Decompose <su>muldi3_highpart.  */
8103       if (/* (truncate:DI  */
8104           mode == DImode
8105           /*   (lshiftrt:TI  */
8106           && GET_MODE (XEXP (x, 0)) == TImode
8107           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8108           /*      (mult:TI  */
8109           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8110           /*        (ANY_EXTEND:TI (reg:DI))
8111                     (ANY_EXTEND:TI (reg:DI)))  */
8112           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8113                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8114               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8115                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8116           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8117           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8118           /*     (const_int 64)  */
8119           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8120           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8121         {
8122           /* UMULH/SMULH.  */
8123           if (speed)
8124             *cost += extra_cost->mult[mode == DImode].extend;
8125           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8126                              mode, MULT, 0, speed);
8127           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8128                              mode, MULT, 1, speed);
8129           return true;
8130         }
8131
8132       /* Fall through.  */
8133     default:
8134       break;
8135     }
8136
8137   if (dump_file
8138       && flag_aarch64_verbose_cost)
8139     fprintf (dump_file,
8140       "\nFailed to cost RTX.  Assuming default cost.\n");
8141
8142   return true;
8143 }
8144
8145 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8146    calculated for X.  This cost is stored in *COST.  Returns true
8147    if the total cost of X was calculated.  */
8148 static bool
8149 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8150                    int param, int *cost, bool speed)
8151 {
8152   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8153
8154   if (dump_file
8155       && flag_aarch64_verbose_cost)
8156     {
8157       print_rtl_single (dump_file, x);
8158       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8159                speed ? "Hot" : "Cold",
8160                *cost, result ? "final" : "partial");
8161     }
8162
8163   return result;
8164 }
8165
8166 static int
8167 aarch64_register_move_cost (machine_mode mode,
8168                             reg_class_t from_i, reg_class_t to_i)
8169 {
8170   enum reg_class from = (enum reg_class) from_i;
8171   enum reg_class to = (enum reg_class) to_i;
8172   const struct cpu_regmove_cost *regmove_cost
8173     = aarch64_tune_params.regmove_cost;
8174
8175   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8176   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8177     to = GENERAL_REGS;
8178
8179   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8180     from = GENERAL_REGS;
8181
8182   /* Moving between GPR and stack cost is the same as GP2GP.  */
8183   if ((from == GENERAL_REGS && to == STACK_REG)
8184       || (to == GENERAL_REGS && from == STACK_REG))
8185     return regmove_cost->GP2GP;
8186
8187   /* To/From the stack register, we move via the gprs.  */
8188   if (to == STACK_REG || from == STACK_REG)
8189     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8190             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8191
8192   if (GET_MODE_SIZE (mode) == 16)
8193     {
8194       /* 128-bit operations on general registers require 2 instructions.  */
8195       if (from == GENERAL_REGS && to == GENERAL_REGS)
8196         return regmove_cost->GP2GP * 2;
8197       else if (from == GENERAL_REGS)
8198         return regmove_cost->GP2FP * 2;
8199       else if (to == GENERAL_REGS)
8200         return regmove_cost->FP2GP * 2;
8201
8202       /* When AdvSIMD instructions are disabled it is not possible to move
8203          a 128-bit value directly between Q registers.  This is handled in
8204          secondary reload.  A general register is used as a scratch to move
8205          the upper DI value and the lower DI value is moved directly,
8206          hence the cost is the sum of three moves. */
8207       if (! TARGET_SIMD)
8208         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8209
8210       return regmove_cost->FP2FP;
8211     }
8212
8213   if (from == GENERAL_REGS && to == GENERAL_REGS)
8214     return regmove_cost->GP2GP;
8215   else if (from == GENERAL_REGS)
8216     return regmove_cost->GP2FP;
8217   else if (to == GENERAL_REGS)
8218     return regmove_cost->FP2GP;
8219
8220   return regmove_cost->FP2FP;
8221 }
8222
8223 static int
8224 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8225                           reg_class_t rclass ATTRIBUTE_UNUSED,
8226                           bool in ATTRIBUTE_UNUSED)
8227 {
8228   return aarch64_tune_params.memmov_cost;
8229 }
8230
8231 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8232    to optimize 1.0/sqrt.  */
8233
8234 static bool
8235 use_rsqrt_p (machine_mode mode)
8236 {
8237   return (!flag_trapping_math
8238           && flag_unsafe_math_optimizations
8239           && ((aarch64_tune_params.approx_modes->recip_sqrt
8240                & AARCH64_APPROX_MODE (mode))
8241               || flag_mrecip_low_precision_sqrt));
8242 }
8243
8244 /* Function to decide when to use the approximate reciprocal square root
8245    builtin.  */
8246
8247 static tree
8248 aarch64_builtin_reciprocal (tree fndecl)
8249 {
8250   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8251
8252   if (!use_rsqrt_p (mode))
8253     return NULL_TREE;
8254   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8255 }
8256
8257 typedef rtx (*rsqrte_type) (rtx, rtx);
8258
8259 /* Select reciprocal square root initial estimate insn depending on machine
8260    mode.  */
8261
8262 static rsqrte_type
8263 get_rsqrte_type (machine_mode mode)
8264 {
8265   switch (mode)
8266   {
8267     case E_DFmode:   return gen_aarch64_rsqrtedf;
8268     case E_SFmode:   return gen_aarch64_rsqrtesf;
8269     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8270     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8271     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8272     default: gcc_unreachable ();
8273   }
8274 }
8275
8276 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8277
8278 /* Select reciprocal square root series step insn depending on machine mode.  */
8279
8280 static rsqrts_type
8281 get_rsqrts_type (machine_mode mode)
8282 {
8283   switch (mode)
8284   {
8285     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8286     case E_SFmode:   return gen_aarch64_rsqrtssf;
8287     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8288     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8289     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8290     default: gcc_unreachable ();
8291   }
8292 }
8293
8294 /* Emit instruction sequence to compute either the approximate square root
8295    or its approximate reciprocal, depending on the flag RECP, and return
8296    whether the sequence was emitted or not.  */
8297
8298 bool
8299 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8300 {
8301   machine_mode mode = GET_MODE (dst);
8302
8303   if (GET_MODE_INNER (mode) == HFmode)
8304     {
8305       gcc_assert (!recp);
8306       return false;
8307     }
8308
8309   if (!recp)
8310     {
8311       if (!(flag_mlow_precision_sqrt
8312             || (aarch64_tune_params.approx_modes->sqrt
8313                 & AARCH64_APPROX_MODE (mode))))
8314         return false;
8315
8316       if (flag_finite_math_only
8317           || flag_trapping_math
8318           || !flag_unsafe_math_optimizations
8319           || optimize_function_for_size_p (cfun))
8320         return false;
8321     }
8322   else
8323     /* Caller assumes we cannot fail.  */
8324     gcc_assert (use_rsqrt_p (mode));
8325
8326   machine_mode mmsk = mode_for_int_vector (mode).require ();
8327   rtx xmsk = gen_reg_rtx (mmsk);
8328   if (!recp)
8329     /* When calculating the approximate square root, compare the
8330        argument with 0.0 and create a mask.  */
8331     emit_insn (gen_rtx_SET (xmsk,
8332                             gen_rtx_NEG (mmsk,
8333                                          gen_rtx_EQ (mmsk, src,
8334                                                      CONST0_RTX (mode)))));
8335
8336   /* Estimate the approximate reciprocal square root.  */
8337   rtx xdst = gen_reg_rtx (mode);
8338   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8339
8340   /* Iterate over the series twice for SF and thrice for DF.  */
8341   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8342
8343   /* Optionally iterate over the series once less for faster performance
8344      while sacrificing the accuracy.  */
8345   if ((recp && flag_mrecip_low_precision_sqrt)
8346       || (!recp && flag_mlow_precision_sqrt))
8347     iterations--;
8348
8349   /* Iterate over the series to calculate the approximate reciprocal square
8350      root.  */
8351   rtx x1 = gen_reg_rtx (mode);
8352   while (iterations--)
8353     {
8354       rtx x2 = gen_reg_rtx (mode);
8355       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8356
8357       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8358
8359       if (iterations > 0)
8360         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8361     }
8362
8363   if (!recp)
8364     {
8365       /* Qualify the approximate reciprocal square root when the argument is
8366          0.0 by squashing the intermediary result to 0.0.  */
8367       rtx xtmp = gen_reg_rtx (mmsk);
8368       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8369                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8370       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8371
8372       /* Calculate the approximate square root.  */
8373       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8374     }
8375
8376   /* Finalize the approximation.  */
8377   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8378
8379   return true;
8380 }
8381
8382 typedef rtx (*recpe_type) (rtx, rtx);
8383
8384 /* Select reciprocal initial estimate insn depending on machine mode.  */
8385
8386 static recpe_type
8387 get_recpe_type (machine_mode mode)
8388 {
8389   switch (mode)
8390   {
8391     case E_SFmode:   return (gen_aarch64_frecpesf);
8392     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8393     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8394     case E_DFmode:   return (gen_aarch64_frecpedf);
8395     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8396     default:         gcc_unreachable ();
8397   }
8398 }
8399
8400 typedef rtx (*recps_type) (rtx, rtx, rtx);
8401
8402 /* Select reciprocal series step insn depending on machine mode.  */
8403
8404 static recps_type
8405 get_recps_type (machine_mode mode)
8406 {
8407   switch (mode)
8408   {
8409     case E_SFmode:   return (gen_aarch64_frecpssf);
8410     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8411     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8412     case E_DFmode:   return (gen_aarch64_frecpsdf);
8413     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8414     default:         gcc_unreachable ();
8415   }
8416 }
8417
8418 /* Emit the instruction sequence to compute the approximation for the division
8419    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8420
8421 bool
8422 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8423 {
8424   machine_mode mode = GET_MODE (quo);
8425
8426   if (GET_MODE_INNER (mode) == HFmode)
8427     return false;
8428
8429   bool use_approx_division_p = (flag_mlow_precision_div
8430                                 || (aarch64_tune_params.approx_modes->division
8431                                     & AARCH64_APPROX_MODE (mode)));
8432
8433   if (!flag_finite_math_only
8434       || flag_trapping_math
8435       || !flag_unsafe_math_optimizations
8436       || optimize_function_for_size_p (cfun)
8437       || !use_approx_division_p)
8438     return false;
8439
8440   /* Estimate the approximate reciprocal.  */
8441   rtx xrcp = gen_reg_rtx (mode);
8442   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8443
8444   /* Iterate over the series twice for SF and thrice for DF.  */
8445   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8446
8447   /* Optionally iterate over the series once less for faster performance,
8448      while sacrificing the accuracy.  */
8449   if (flag_mlow_precision_div)
8450     iterations--;
8451
8452   /* Iterate over the series to calculate the approximate reciprocal.  */
8453   rtx xtmp = gen_reg_rtx (mode);
8454   while (iterations--)
8455     {
8456       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8457
8458       if (iterations > 0)
8459         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8460     }
8461
8462   if (num != CONST1_RTX (mode))
8463     {
8464       /* As the approximate reciprocal of DEN is already calculated, only
8465          calculate the approximate division when NUM is not 1.0.  */
8466       rtx xnum = force_reg (mode, num);
8467       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8468     }
8469
8470   /* Finalize the approximation.  */
8471   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8472   return true;
8473 }
8474
8475 /* Return the number of instructions that can be issued per cycle.  */
8476 static int
8477 aarch64_sched_issue_rate (void)
8478 {
8479   return aarch64_tune_params.issue_rate;
8480 }
8481
8482 static int
8483 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8484 {
8485   int issue_rate = aarch64_sched_issue_rate ();
8486
8487   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8488 }
8489
8490
8491 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8492    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8493    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8494
8495 static int
8496 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8497                                                     int ready_index)
8498 {
8499   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8500 }
8501
8502
8503 /* Vectorizer cost model target hooks.  */
8504
8505 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8506 static int
8507 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8508                                     tree vectype,
8509                                     int misalign ATTRIBUTE_UNUSED)
8510 {
8511   unsigned elements;
8512   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8513   bool fp = false;
8514
8515   if (vectype != NULL)
8516     fp = FLOAT_TYPE_P (vectype);
8517
8518   switch (type_of_cost)
8519     {
8520       case scalar_stmt:
8521         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8522
8523       case scalar_load:
8524         return costs->scalar_load_cost;
8525
8526       case scalar_store:
8527         return costs->scalar_store_cost;
8528
8529       case vector_stmt:
8530         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8531
8532       case vector_load:
8533         return costs->vec_align_load_cost;
8534
8535       case vector_store:
8536         return costs->vec_store_cost;
8537
8538       case vec_to_scalar:
8539         return costs->vec_to_scalar_cost;
8540
8541       case scalar_to_vec:
8542         return costs->scalar_to_vec_cost;
8543
8544       case unaligned_load:
8545         return costs->vec_unalign_load_cost;
8546
8547       case unaligned_store:
8548         return costs->vec_unalign_store_cost;
8549
8550       case cond_branch_taken:
8551         return costs->cond_taken_branch_cost;
8552
8553       case cond_branch_not_taken:
8554         return costs->cond_not_taken_branch_cost;
8555
8556       case vec_perm:
8557         return costs->vec_permute_cost;
8558
8559       case vec_promote_demote:
8560         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8561
8562       case vec_construct:
8563         elements = TYPE_VECTOR_SUBPARTS (vectype);
8564         return elements / 2 + 1;
8565
8566       default:
8567         gcc_unreachable ();
8568     }
8569 }
8570
8571 /* Implement targetm.vectorize.add_stmt_cost.  */
8572 static unsigned
8573 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8574                        struct _stmt_vec_info *stmt_info, int misalign,
8575                        enum vect_cost_model_location where)
8576 {
8577   unsigned *cost = (unsigned *) data;
8578   unsigned retval = 0;
8579
8580   if (flag_vect_cost_model)
8581     {
8582       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8583       int stmt_cost =
8584             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8585
8586       /* Statements in an inner loop relative to the loop being
8587          vectorized are weighted more heavily.  The value here is
8588          arbitrary and could potentially be improved with analysis.  */
8589       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8590         count *= 50; /*  FIXME  */
8591
8592       retval = (unsigned) (count * stmt_cost);
8593       cost[where] += retval;
8594     }
8595
8596   return retval;
8597 }
8598
8599 static void initialize_aarch64_code_model (struct gcc_options *);
8600
8601 /* Parse the TO_PARSE string and put the architecture struct that it
8602    selects into RES and the architectural features into ISA_FLAGS.
8603    Return an aarch64_parse_opt_result describing the parse result.
8604    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8605
8606 static enum aarch64_parse_opt_result
8607 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8608                     unsigned long *isa_flags)
8609 {
8610   char *ext;
8611   const struct processor *arch;
8612   char *str = (char *) alloca (strlen (to_parse) + 1);
8613   size_t len;
8614
8615   strcpy (str, to_parse);
8616
8617   ext = strchr (str, '+');
8618
8619   if (ext != NULL)
8620     len = ext - str;
8621   else
8622     len = strlen (str);
8623
8624   if (len == 0)
8625     return AARCH64_PARSE_MISSING_ARG;
8626
8627
8628   /* Loop through the list of supported ARCHes to find a match.  */
8629   for (arch = all_architectures; arch->name != NULL; arch++)
8630     {
8631       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8632         {
8633           unsigned long isa_temp = arch->flags;
8634
8635           if (ext != NULL)
8636             {
8637               /* TO_PARSE string contains at least one extension.  */
8638               enum aarch64_parse_opt_result ext_res
8639                 = aarch64_parse_extension (ext, &isa_temp);
8640
8641               if (ext_res != AARCH64_PARSE_OK)
8642                 return ext_res;
8643             }
8644           /* Extension parsing was successful.  Confirm the result
8645              arch and ISA flags.  */
8646           *res = arch;
8647           *isa_flags = isa_temp;
8648           return AARCH64_PARSE_OK;
8649         }
8650     }
8651
8652   /* ARCH name not found in list.  */
8653   return AARCH64_PARSE_INVALID_ARG;
8654 }
8655
8656 /* Parse the TO_PARSE string and put the result tuning in RES and the
8657    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8658    describing the parse result.  If there is an error parsing, RES and
8659    ISA_FLAGS are left unchanged.  */
8660
8661 static enum aarch64_parse_opt_result
8662 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8663                    unsigned long *isa_flags)
8664 {
8665   char *ext;
8666   const struct processor *cpu;
8667   char *str = (char *) alloca (strlen (to_parse) + 1);
8668   size_t len;
8669
8670   strcpy (str, to_parse);
8671
8672   ext = strchr (str, '+');
8673
8674   if (ext != NULL)
8675     len = ext - str;
8676   else
8677     len = strlen (str);
8678
8679   if (len == 0)
8680     return AARCH64_PARSE_MISSING_ARG;
8681
8682
8683   /* Loop through the list of supported CPUs to find a match.  */
8684   for (cpu = all_cores; cpu->name != NULL; cpu++)
8685     {
8686       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8687         {
8688           unsigned long isa_temp = cpu->flags;
8689
8690
8691           if (ext != NULL)
8692             {
8693               /* TO_PARSE string contains at least one extension.  */
8694               enum aarch64_parse_opt_result ext_res
8695                 = aarch64_parse_extension (ext, &isa_temp);
8696
8697               if (ext_res != AARCH64_PARSE_OK)
8698                 return ext_res;
8699             }
8700           /* Extension parsing was successfull.  Confirm the result
8701              cpu and ISA flags.  */
8702           *res = cpu;
8703           *isa_flags = isa_temp;
8704           return AARCH64_PARSE_OK;
8705         }
8706     }
8707
8708   /* CPU name not found in list.  */
8709   return AARCH64_PARSE_INVALID_ARG;
8710 }
8711
8712 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8713    Return an aarch64_parse_opt_result describing the parse result.
8714    If the parsing fails the RES does not change.  */
8715
8716 static enum aarch64_parse_opt_result
8717 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8718 {
8719   const struct processor *cpu;
8720   char *str = (char *) alloca (strlen (to_parse) + 1);
8721
8722   strcpy (str, to_parse);
8723
8724   /* Loop through the list of supported CPUs to find a match.  */
8725   for (cpu = all_cores; cpu->name != NULL; cpu++)
8726     {
8727       if (strcmp (cpu->name, str) == 0)
8728         {
8729           *res = cpu;
8730           return AARCH64_PARSE_OK;
8731         }
8732     }
8733
8734   /* CPU name not found in list.  */
8735   return AARCH64_PARSE_INVALID_ARG;
8736 }
8737
8738 /* Parse TOKEN, which has length LENGTH to see if it is an option
8739    described in FLAG.  If it is, return the index bit for that fusion type.
8740    If not, error (printing OPTION_NAME) and return zero.  */
8741
8742 static unsigned int
8743 aarch64_parse_one_option_token (const char *token,
8744                                 size_t length,
8745                                 const struct aarch64_flag_desc *flag,
8746                                 const char *option_name)
8747 {
8748   for (; flag->name != NULL; flag++)
8749     {
8750       if (length == strlen (flag->name)
8751           && !strncmp (flag->name, token, length))
8752         return flag->flag;
8753     }
8754
8755   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8756   return 0;
8757 }
8758
8759 /* Parse OPTION which is a comma-separated list of flags to enable.
8760    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8761    default state we inherit from the CPU tuning structures.  OPTION_NAME
8762    gives the top-level option we are parsing in the -moverride string,
8763    for use in error messages.  */
8764
8765 static unsigned int
8766 aarch64_parse_boolean_options (const char *option,
8767                                const struct aarch64_flag_desc *flags,
8768                                unsigned int initial_state,
8769                                const char *option_name)
8770 {
8771   const char separator = '.';
8772   const char* specs = option;
8773   const char* ntoken = option;
8774   unsigned int found_flags = initial_state;
8775
8776   while ((ntoken = strchr (specs, separator)))
8777     {
8778       size_t token_length = ntoken - specs;
8779       unsigned token_ops = aarch64_parse_one_option_token (specs,
8780                                                            token_length,
8781                                                            flags,
8782                                                            option_name);
8783       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8784          in the token stream, reset the supported operations.  So:
8785
8786            adrp+add.cmp+branch.none.adrp+add
8787
8788            would have the result of turning on only adrp+add fusion.  */
8789       if (!token_ops)
8790         found_flags = 0;
8791
8792       found_flags |= token_ops;
8793       specs = ++ntoken;
8794     }
8795
8796   /* We ended with a comma, print something.  */
8797   if (!(*specs))
8798     {
8799       error ("%s string ill-formed\n", option_name);
8800       return 0;
8801     }
8802
8803   /* We still have one more token to parse.  */
8804   size_t token_length = strlen (specs);
8805   unsigned token_ops = aarch64_parse_one_option_token (specs,
8806                                                        token_length,
8807                                                        flags,
8808                                                        option_name);
8809    if (!token_ops)
8810      found_flags = 0;
8811
8812   found_flags |= token_ops;
8813   return found_flags;
8814 }
8815
8816 /* Support for overriding instruction fusion.  */
8817
8818 static void
8819 aarch64_parse_fuse_string (const char *fuse_string,
8820                             struct tune_params *tune)
8821 {
8822   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8823                                                      aarch64_fusible_pairs,
8824                                                      tune->fusible_ops,
8825                                                      "fuse=");
8826 }
8827
8828 /* Support for overriding other tuning flags.  */
8829
8830 static void
8831 aarch64_parse_tune_string (const char *tune_string,
8832                             struct tune_params *tune)
8833 {
8834   tune->extra_tuning_flags
8835     = aarch64_parse_boolean_options (tune_string,
8836                                      aarch64_tuning_flags,
8837                                      tune->extra_tuning_flags,
8838                                      "tune=");
8839 }
8840
8841 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8842    we understand.  If it is, extract the option string and handoff to
8843    the appropriate function.  */
8844
8845 void
8846 aarch64_parse_one_override_token (const char* token,
8847                                   size_t length,
8848                                   struct tune_params *tune)
8849 {
8850   const struct aarch64_tuning_override_function *fn
8851     = aarch64_tuning_override_functions;
8852
8853   const char *option_part = strchr (token, '=');
8854   if (!option_part)
8855     {
8856       error ("tuning string missing in option (%s)", token);
8857       return;
8858     }
8859
8860   /* Get the length of the option name.  */
8861   length = option_part - token;
8862   /* Skip the '=' to get to the option string.  */
8863   option_part++;
8864
8865   for (; fn->name != NULL; fn++)
8866     {
8867       if (!strncmp (fn->name, token, length))
8868         {
8869           fn->parse_override (option_part, tune);
8870           return;
8871         }
8872     }
8873
8874   error ("unknown tuning option (%s)",token);
8875   return;
8876 }
8877
8878 /* A checking mechanism for the implementation of the tls size.  */
8879
8880 static void
8881 initialize_aarch64_tls_size (struct gcc_options *opts)
8882 {
8883   if (aarch64_tls_size == 0)
8884     aarch64_tls_size = 24;
8885
8886   switch (opts->x_aarch64_cmodel_var)
8887     {
8888     case AARCH64_CMODEL_TINY:
8889       /* Both the default and maximum TLS size allowed under tiny is 1M which
8890          needs two instructions to address, so we clamp the size to 24.  */
8891       if (aarch64_tls_size > 24)
8892         aarch64_tls_size = 24;
8893       break;
8894     case AARCH64_CMODEL_SMALL:
8895       /* The maximum TLS size allowed under small is 4G.  */
8896       if (aarch64_tls_size > 32)
8897         aarch64_tls_size = 32;
8898       break;
8899     case AARCH64_CMODEL_LARGE:
8900       /* The maximum TLS size allowed under large is 16E.
8901          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8902       if (aarch64_tls_size > 48)
8903         aarch64_tls_size = 48;
8904       break;
8905     default:
8906       gcc_unreachable ();
8907     }
8908
8909   return;
8910 }
8911
8912 /* Parse STRING looking for options in the format:
8913      string     :: option:string
8914      option     :: name=substring
8915      name       :: {a-z}
8916      substring  :: defined by option.  */
8917
8918 static void
8919 aarch64_parse_override_string (const char* input_string,
8920                                struct tune_params* tune)
8921 {
8922   const char separator = ':';
8923   size_t string_length = strlen (input_string) + 1;
8924   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8925   char *string = string_root;
8926   strncpy (string, input_string, string_length);
8927   string[string_length - 1] = '\0';
8928
8929   char* ntoken = string;
8930
8931   while ((ntoken = strchr (string, separator)))
8932     {
8933       size_t token_length = ntoken - string;
8934       /* Make this substring look like a string.  */
8935       *ntoken = '\0';
8936       aarch64_parse_one_override_token (string, token_length, tune);
8937       string = ++ntoken;
8938     }
8939
8940   /* One last option to parse.  */
8941   aarch64_parse_one_override_token (string, strlen (string), tune);
8942   free (string_root);
8943 }
8944
8945
8946 static void
8947 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8948 {
8949   /* The logic here is that if we are disabling all frame pointer generation
8950      then we do not need to disable leaf frame pointer generation as a
8951      separate operation.  But if we are *only* disabling leaf frame pointer
8952      generation then we set flag_omit_frame_pointer to true, but in
8953      aarch64_frame_pointer_required we return false only for leaf functions.
8954
8955      PR 70044: We have to be careful about being called multiple times for the
8956      same function.  Once we have decided to set flag_omit_frame_pointer just
8957      so that we can omit leaf frame pointers, we must then not interpret a
8958      second call as meaning that all frame pointer generation should be
8959      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8960      non-zero value.  */
8961   if (opts->x_flag_omit_frame_pointer == 2)
8962     opts->x_flag_omit_frame_pointer = 0;
8963
8964   if (opts->x_flag_omit_frame_pointer)
8965     opts->x_flag_omit_leaf_frame_pointer = false;
8966   else if (opts->x_flag_omit_leaf_frame_pointer)
8967     opts->x_flag_omit_frame_pointer = 2;
8968
8969   /* If not optimizing for size, set the default
8970      alignment to what the target wants.  */
8971   if (!opts->x_optimize_size)
8972     {
8973       if (opts->x_align_loops <= 0)
8974         opts->x_align_loops = aarch64_tune_params.loop_align;
8975       if (opts->x_align_jumps <= 0)
8976         opts->x_align_jumps = aarch64_tune_params.jump_align;
8977       if (opts->x_align_functions <= 0)
8978         opts->x_align_functions = aarch64_tune_params.function_align;
8979     }
8980
8981   /* We default to no pc-relative literal loads.  */
8982
8983   aarch64_pcrelative_literal_loads = false;
8984
8985   /* If -mpc-relative-literal-loads is set on the command line, this
8986      implies that the user asked for PC relative literal loads.  */
8987   if (opts->x_pcrelative_literal_loads == 1)
8988     aarch64_pcrelative_literal_loads = true;
8989
8990   /* This is PR70113. When building the Linux kernel with
8991      CONFIG_ARM64_ERRATUM_843419, support for relocations
8992      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8993      removed from the kernel to avoid loading objects with possibly
8994      offending sequences.  Without -mpc-relative-literal-loads we would
8995      generate such relocations, preventing the kernel build from
8996      succeeding.  */
8997   if (opts->x_pcrelative_literal_loads == 2
8998       && TARGET_FIX_ERR_A53_843419)
8999     aarch64_pcrelative_literal_loads = true;
9000
9001   /* In the tiny memory model it makes no sense to disallow PC relative
9002      literal pool loads.  */
9003   if (aarch64_cmodel == AARCH64_CMODEL_TINY
9004       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9005     aarch64_pcrelative_literal_loads = true;
9006
9007   /* When enabling the lower precision Newton series for the square root, also
9008      enable it for the reciprocal square root, since the latter is an
9009      intermediary step for the former.  */
9010   if (flag_mlow_precision_sqrt)
9011     flag_mrecip_low_precision_sqrt = true;
9012 }
9013
9014 /* 'Unpack' up the internal tuning structs and update the options
9015     in OPTS.  The caller must have set up selected_tune and selected_arch
9016     as all the other target-specific codegen decisions are
9017     derived from them.  */
9018
9019 void
9020 aarch64_override_options_internal (struct gcc_options *opts)
9021 {
9022   aarch64_tune_flags = selected_tune->flags;
9023   aarch64_tune = selected_tune->sched_core;
9024   /* Make a copy of the tuning parameters attached to the core, which
9025      we may later overwrite.  */
9026   aarch64_tune_params = *(selected_tune->tune);
9027   aarch64_architecture_version = selected_arch->architecture_version;
9028
9029   if (opts->x_aarch64_override_tune_string)
9030     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9031                                   &aarch64_tune_params);
9032
9033   /* This target defaults to strict volatile bitfields.  */
9034   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9035     opts->x_flag_strict_volatile_bitfields = 1;
9036
9037   initialize_aarch64_code_model (opts);
9038   initialize_aarch64_tls_size (opts);
9039
9040   int queue_depth = 0;
9041   switch (aarch64_tune_params.autoprefetcher_model)
9042     {
9043       case tune_params::AUTOPREFETCHER_OFF:
9044         queue_depth = -1;
9045         break;
9046       case tune_params::AUTOPREFETCHER_WEAK:
9047         queue_depth = 0;
9048         break;
9049       case tune_params::AUTOPREFETCHER_STRONG:
9050         queue_depth = max_insn_queue_index + 1;
9051         break;
9052       default:
9053         gcc_unreachable ();
9054     }
9055
9056   /* We don't mind passing in global_options_set here as we don't use
9057      the *options_set structs anyway.  */
9058   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9059                          queue_depth,
9060                          opts->x_param_values,
9061                          global_options_set.x_param_values);
9062
9063   /* Set up parameters to be used in prefetching algorithm.  Do not
9064      override the defaults unless we are tuning for a core we have
9065      researched values for.  */
9066   if (aarch64_tune_params.prefetch->num_slots > 0)
9067     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9068                            aarch64_tune_params.prefetch->num_slots,
9069                            opts->x_param_values,
9070                            global_options_set.x_param_values);
9071   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9072     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9073                            aarch64_tune_params.prefetch->l1_cache_size,
9074                            opts->x_param_values,
9075                            global_options_set.x_param_values);
9076   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9077     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9078                            aarch64_tune_params.prefetch->l1_cache_line_size,
9079                            opts->x_param_values,
9080                            global_options_set.x_param_values);
9081   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9082     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9083                            aarch64_tune_params.prefetch->l2_cache_size,
9084                            opts->x_param_values,
9085                            global_options_set.x_param_values);
9086
9087   /* Enable sw prefetching at specified optimization level for
9088      CPUS that have prefetch.  Lower optimization level threshold by 1
9089      when profiling is enabled.  */
9090   if (opts->x_flag_prefetch_loop_arrays < 0
9091       && !opts->x_optimize_size
9092       && aarch64_tune_params.prefetch->default_opt_level >= 0
9093       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9094     opts->x_flag_prefetch_loop_arrays = 1;
9095
9096   aarch64_override_options_after_change_1 (opts);
9097 }
9098
9099 /* Print a hint with a suggestion for a core or architecture name that
9100    most closely resembles what the user passed in STR.  ARCH is true if
9101    the user is asking for an architecture name.  ARCH is false if the user
9102    is asking for a core name.  */
9103
9104 static void
9105 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9106 {
9107   auto_vec<const char *> candidates;
9108   const struct processor *entry = arch ? all_architectures : all_cores;
9109   for (; entry->name != NULL; entry++)
9110     candidates.safe_push (entry->name);
9111   char *s;
9112   const char *hint = candidates_list_and_hint (str, s, candidates);
9113   if (hint)
9114     inform (input_location, "valid arguments are: %s;"
9115                              " did you mean %qs?", s, hint);
9116   XDELETEVEC (s);
9117 }
9118
9119 /* Print a hint with a suggestion for a core name that most closely resembles
9120    what the user passed in STR.  */
9121
9122 inline static void
9123 aarch64_print_hint_for_core (const char *str)
9124 {
9125   aarch64_print_hint_for_core_or_arch (str, false);
9126 }
9127
9128 /* Print a hint with a suggestion for an architecture name that most closely
9129    resembles what the user passed in STR.  */
9130
9131 inline static void
9132 aarch64_print_hint_for_arch (const char *str)
9133 {
9134   aarch64_print_hint_for_core_or_arch (str, true);
9135 }
9136
9137 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9138    specified in STR and throw errors if appropriate.  Put the results if
9139    they are valid in RES and ISA_FLAGS.  Return whether the option is
9140    valid.  */
9141
9142 static bool
9143 aarch64_validate_mcpu (const char *str, const struct processor **res,
9144                        unsigned long *isa_flags)
9145 {
9146   enum aarch64_parse_opt_result parse_res
9147     = aarch64_parse_cpu (str, res, isa_flags);
9148
9149   if (parse_res == AARCH64_PARSE_OK)
9150     return true;
9151
9152   switch (parse_res)
9153     {
9154       case AARCH64_PARSE_MISSING_ARG:
9155         error ("missing cpu name in %<-mcpu=%s%>", str);
9156         break;
9157       case AARCH64_PARSE_INVALID_ARG:
9158         error ("unknown value %qs for -mcpu", str);
9159         aarch64_print_hint_for_core (str);
9160         break;
9161       case AARCH64_PARSE_INVALID_FEATURE:
9162         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9163         break;
9164       default:
9165         gcc_unreachable ();
9166     }
9167
9168   return false;
9169 }
9170
9171 /* Validate a command-line -march option.  Parse the arch and extensions
9172    (if any) specified in STR and throw errors if appropriate.  Put the
9173    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9174    option is valid.  */
9175
9176 static bool
9177 aarch64_validate_march (const char *str, const struct processor **res,
9178                          unsigned long *isa_flags)
9179 {
9180   enum aarch64_parse_opt_result parse_res
9181     = aarch64_parse_arch (str, res, isa_flags);
9182
9183   if (parse_res == AARCH64_PARSE_OK)
9184     return true;
9185
9186   switch (parse_res)
9187     {
9188       case AARCH64_PARSE_MISSING_ARG:
9189         error ("missing arch name in %<-march=%s%>", str);
9190         break;
9191       case AARCH64_PARSE_INVALID_ARG:
9192         error ("unknown value %qs for -march", str);
9193         aarch64_print_hint_for_arch (str);
9194         break;
9195       case AARCH64_PARSE_INVALID_FEATURE:
9196         error ("invalid feature modifier in %<-march=%s%>", str);
9197         break;
9198       default:
9199         gcc_unreachable ();
9200     }
9201
9202   return false;
9203 }
9204
9205 /* Validate a command-line -mtune option.  Parse the cpu
9206    specified in STR and throw errors if appropriate.  Put the
9207    result, if it is valid, in RES.  Return whether the option is
9208    valid.  */
9209
9210 static bool
9211 aarch64_validate_mtune (const char *str, const struct processor **res)
9212 {
9213   enum aarch64_parse_opt_result parse_res
9214     = aarch64_parse_tune (str, res);
9215
9216   if (parse_res == AARCH64_PARSE_OK)
9217     return true;
9218
9219   switch (parse_res)
9220     {
9221       case AARCH64_PARSE_MISSING_ARG:
9222         error ("missing cpu name in %<-mtune=%s%>", str);
9223         break;
9224       case AARCH64_PARSE_INVALID_ARG:
9225         error ("unknown value %qs for -mtune", str);
9226         aarch64_print_hint_for_core (str);
9227         break;
9228       default:
9229         gcc_unreachable ();
9230     }
9231   return false;
9232 }
9233
9234 /* Return the CPU corresponding to the enum CPU.
9235    If it doesn't specify a cpu, return the default.  */
9236
9237 static const struct processor *
9238 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9239 {
9240   if (cpu != aarch64_none)
9241     return &all_cores[cpu];
9242
9243   /* The & 0x3f is to extract the bottom 6 bits that encode the
9244      default cpu as selected by the --with-cpu GCC configure option
9245      in config.gcc.
9246      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9247      flags mechanism should be reworked to make it more sane.  */
9248   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9249 }
9250
9251 /* Return the architecture corresponding to the enum ARCH.
9252    If it doesn't specify a valid architecture, return the default.  */
9253
9254 static const struct processor *
9255 aarch64_get_arch (enum aarch64_arch arch)
9256 {
9257   if (arch != aarch64_no_arch)
9258     return &all_architectures[arch];
9259
9260   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9261
9262   return &all_architectures[cpu->arch];
9263 }
9264
9265 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9266    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9267    tuning structs.  In particular it must set selected_tune and
9268    aarch64_isa_flags that define the available ISA features and tuning
9269    decisions.  It must also set selected_arch as this will be used to
9270    output the .arch asm tags for each function.  */
9271
9272 static void
9273 aarch64_override_options (void)
9274 {
9275   unsigned long cpu_isa = 0;
9276   unsigned long arch_isa = 0;
9277   aarch64_isa_flags = 0;
9278
9279   bool valid_cpu = true;
9280   bool valid_tune = true;
9281   bool valid_arch = true;
9282
9283   selected_cpu = NULL;
9284   selected_arch = NULL;
9285   selected_tune = NULL;
9286
9287   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9288      If either of -march or -mtune is given, they override their
9289      respective component of -mcpu.  */
9290   if (aarch64_cpu_string)
9291     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9292                                         &cpu_isa);
9293
9294   if (aarch64_arch_string)
9295     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9296                                           &arch_isa);
9297
9298   if (aarch64_tune_string)
9299     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9300
9301   /* If the user did not specify a processor, choose the default
9302      one for them.  This will be the CPU set during configuration using
9303      --with-cpu, otherwise it is "generic".  */
9304   if (!selected_cpu)
9305     {
9306       if (selected_arch)
9307         {
9308           selected_cpu = &all_cores[selected_arch->ident];
9309           aarch64_isa_flags = arch_isa;
9310           explicit_arch = selected_arch->arch;
9311         }
9312       else
9313         {
9314           /* Get default configure-time CPU.  */
9315           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9316           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9317         }
9318
9319       if (selected_tune)
9320         explicit_tune_core = selected_tune->ident;
9321     }
9322   /* If both -mcpu and -march are specified check that they are architecturally
9323      compatible, warn if they're not and prefer the -march ISA flags.  */
9324   else if (selected_arch)
9325     {
9326       if (selected_arch->arch != selected_cpu->arch)
9327         {
9328           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9329                        all_architectures[selected_cpu->arch].name,
9330                        selected_arch->name);
9331         }
9332       aarch64_isa_flags = arch_isa;
9333       explicit_arch = selected_arch->arch;
9334       explicit_tune_core = selected_tune ? selected_tune->ident
9335                                           : selected_cpu->ident;
9336     }
9337   else
9338     {
9339       /* -mcpu but no -march.  */
9340       aarch64_isa_flags = cpu_isa;
9341       explicit_tune_core = selected_tune ? selected_tune->ident
9342                                           : selected_cpu->ident;
9343       gcc_assert (selected_cpu);
9344       selected_arch = &all_architectures[selected_cpu->arch];
9345       explicit_arch = selected_arch->arch;
9346     }
9347
9348   /* Set the arch as well as we will need it when outputing
9349      the .arch directive in assembly.  */
9350   if (!selected_arch)
9351     {
9352       gcc_assert (selected_cpu);
9353       selected_arch = &all_architectures[selected_cpu->arch];
9354     }
9355
9356   if (!selected_tune)
9357     selected_tune = selected_cpu;
9358
9359 #ifndef HAVE_AS_MABI_OPTION
9360   /* The compiler may have been configured with 2.23.* binutils, which does
9361      not have support for ILP32.  */
9362   if (TARGET_ILP32)
9363     error ("Assembler does not support -mabi=ilp32");
9364 #endif
9365
9366   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9367     sorry ("Return address signing is only supported for -mabi=lp64");
9368
9369   /* Make sure we properly set up the explicit options.  */
9370   if ((aarch64_cpu_string && valid_cpu)
9371        || (aarch64_tune_string && valid_tune))
9372     gcc_assert (explicit_tune_core != aarch64_none);
9373
9374   if ((aarch64_cpu_string && valid_cpu)
9375        || (aarch64_arch_string && valid_arch))
9376     gcc_assert (explicit_arch != aarch64_no_arch);
9377
9378   aarch64_override_options_internal (&global_options);
9379
9380   /* Save these options as the default ones in case we push and pop them later
9381      while processing functions with potential target attributes.  */
9382   target_option_default_node = target_option_current_node
9383       = build_target_option_node (&global_options);
9384 }
9385
9386 /* Implement targetm.override_options_after_change.  */
9387
9388 static void
9389 aarch64_override_options_after_change (void)
9390 {
9391   aarch64_override_options_after_change_1 (&global_options);
9392 }
9393
9394 static struct machine_function *
9395 aarch64_init_machine_status (void)
9396 {
9397   struct machine_function *machine;
9398   machine = ggc_cleared_alloc<machine_function> ();
9399   return machine;
9400 }
9401
9402 void
9403 aarch64_init_expanders (void)
9404 {
9405   init_machine_status = aarch64_init_machine_status;
9406 }
9407
9408 /* A checking mechanism for the implementation of the various code models.  */
9409 static void
9410 initialize_aarch64_code_model (struct gcc_options *opts)
9411 {
9412    if (opts->x_flag_pic)
9413      {
9414        switch (opts->x_aarch64_cmodel_var)
9415          {
9416          case AARCH64_CMODEL_TINY:
9417            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9418            break;
9419          case AARCH64_CMODEL_SMALL:
9420 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9421            aarch64_cmodel = (flag_pic == 2
9422                              ? AARCH64_CMODEL_SMALL_PIC
9423                              : AARCH64_CMODEL_SMALL_SPIC);
9424 #else
9425            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9426 #endif
9427            break;
9428          case AARCH64_CMODEL_LARGE:
9429            sorry ("code model %qs with -f%s", "large",
9430                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9431            break;
9432          default:
9433            gcc_unreachable ();
9434          }
9435      }
9436    else
9437      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9438 }
9439
9440 /* Implement TARGET_OPTION_SAVE.  */
9441
9442 static void
9443 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9444 {
9445   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9446 }
9447
9448 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9449    using the information saved in PTR.  */
9450
9451 static void
9452 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9453 {
9454   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9455   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9456   opts->x_explicit_arch = ptr->x_explicit_arch;
9457   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9458   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9459
9460   aarch64_override_options_internal (opts);
9461 }
9462
9463 /* Implement TARGET_OPTION_PRINT.  */
9464
9465 static void
9466 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9467 {
9468   const struct processor *cpu
9469     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9470   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9471   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9472   std::string extension
9473     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9474
9475   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9476   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9477            arch->name, extension.c_str ());
9478 }
9479
9480 static GTY(()) tree aarch64_previous_fndecl;
9481
9482 void
9483 aarch64_reset_previous_fndecl (void)
9484 {
9485   aarch64_previous_fndecl = NULL;
9486 }
9487
9488 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9489    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9490    make sure optab availability predicates are recomputed when necessary.  */
9491
9492 void
9493 aarch64_save_restore_target_globals (tree new_tree)
9494 {
9495   if (TREE_TARGET_GLOBALS (new_tree))
9496     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9497   else if (new_tree == target_option_default_node)
9498     restore_target_globals (&default_target_globals);
9499   else
9500     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9501 }
9502
9503 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9504    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9505    of the function, if such exists.  This function may be called multiple
9506    times on a single function so use aarch64_previous_fndecl to avoid
9507    setting up identical state.  */
9508
9509 static void
9510 aarch64_set_current_function (tree fndecl)
9511 {
9512   if (!fndecl || fndecl == aarch64_previous_fndecl)
9513     return;
9514
9515   tree old_tree = (aarch64_previous_fndecl
9516                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9517                    : NULL_TREE);
9518
9519   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9520
9521   /* If current function has no attributes but the previous one did,
9522      use the default node.  */
9523   if (!new_tree && old_tree)
9524     new_tree = target_option_default_node;
9525
9526   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9527      the default have been handled by aarch64_save_restore_target_globals from
9528      aarch64_pragma_target_parse.  */
9529   if (old_tree == new_tree)
9530     return;
9531
9532   aarch64_previous_fndecl = fndecl;
9533
9534   /* First set the target options.  */
9535   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9536
9537   aarch64_save_restore_target_globals (new_tree);
9538 }
9539
9540 /* Enum describing the various ways we can handle attributes.
9541    In many cases we can reuse the generic option handling machinery.  */
9542
9543 enum aarch64_attr_opt_type
9544 {
9545   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9546   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9547   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9548   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9549 };
9550
9551 /* All the information needed to handle a target attribute.
9552    NAME is the name of the attribute.
9553    ATTR_TYPE specifies the type of behavior of the attribute as described
9554    in the definition of enum aarch64_attr_opt_type.
9555    ALLOW_NEG is true if the attribute supports a "no-" form.
9556    HANDLER is the function that takes the attribute string and whether
9557    it is a pragma or attribute and handles the option.  It is needed only
9558    when the ATTR_TYPE is aarch64_attr_custom.
9559    OPT_NUM is the enum specifying the option that the attribute modifies.
9560    This is needed for attributes that mirror the behavior of a command-line
9561    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9562    aarch64_attr_enum.  */
9563
9564 struct aarch64_attribute_info
9565 {
9566   const char *name;
9567   enum aarch64_attr_opt_type attr_type;
9568   bool allow_neg;
9569   bool (*handler) (const char *, const char *);
9570   enum opt_code opt_num;
9571 };
9572
9573 /* Handle the ARCH_STR argument to the arch= target attribute.
9574    PRAGMA_OR_ATTR is used in potential error messages.  */
9575
9576 static bool
9577 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9578 {
9579   const struct processor *tmp_arch = NULL;
9580   enum aarch64_parse_opt_result parse_res
9581     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9582
9583   if (parse_res == AARCH64_PARSE_OK)
9584     {
9585       gcc_assert (tmp_arch);
9586       selected_arch = tmp_arch;
9587       explicit_arch = selected_arch->arch;
9588       return true;
9589     }
9590
9591   switch (parse_res)
9592     {
9593       case AARCH64_PARSE_MISSING_ARG:
9594         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9595         break;
9596       case AARCH64_PARSE_INVALID_ARG:
9597         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9598         aarch64_print_hint_for_arch (str);
9599         break;
9600       case AARCH64_PARSE_INVALID_FEATURE:
9601         error ("invalid feature modifier %qs for 'arch' target %s",
9602                str, pragma_or_attr);
9603         break;
9604       default:
9605         gcc_unreachable ();
9606     }
9607
9608   return false;
9609 }
9610
9611 /* Handle the argument CPU_STR to the cpu= target attribute.
9612    PRAGMA_OR_ATTR is used in potential error messages.  */
9613
9614 static bool
9615 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9616 {
9617   const struct processor *tmp_cpu = NULL;
9618   enum aarch64_parse_opt_result parse_res
9619     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9620
9621   if (parse_res == AARCH64_PARSE_OK)
9622     {
9623       gcc_assert (tmp_cpu);
9624       selected_tune = tmp_cpu;
9625       explicit_tune_core = selected_tune->ident;
9626
9627       selected_arch = &all_architectures[tmp_cpu->arch];
9628       explicit_arch = selected_arch->arch;
9629       return true;
9630     }
9631
9632   switch (parse_res)
9633     {
9634       case AARCH64_PARSE_MISSING_ARG:
9635         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9636         break;
9637       case AARCH64_PARSE_INVALID_ARG:
9638         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9639         aarch64_print_hint_for_core (str);
9640         break;
9641       case AARCH64_PARSE_INVALID_FEATURE:
9642         error ("invalid feature modifier %qs for 'cpu' target %s",
9643                str, pragma_or_attr);
9644         break;
9645       default:
9646         gcc_unreachable ();
9647     }
9648
9649   return false;
9650 }
9651
9652 /* Handle the argument STR to the tune= target attribute.
9653    PRAGMA_OR_ATTR is used in potential error messages.  */
9654
9655 static bool
9656 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9657 {
9658   const struct processor *tmp_tune = NULL;
9659   enum aarch64_parse_opt_result parse_res
9660     = aarch64_parse_tune (str, &tmp_tune);
9661
9662   if (parse_res == AARCH64_PARSE_OK)
9663     {
9664       gcc_assert (tmp_tune);
9665       selected_tune = tmp_tune;
9666       explicit_tune_core = selected_tune->ident;
9667       return true;
9668     }
9669
9670   switch (parse_res)
9671     {
9672       case AARCH64_PARSE_INVALID_ARG:
9673         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9674         aarch64_print_hint_for_core (str);
9675         break;
9676       default:
9677         gcc_unreachable ();
9678     }
9679
9680   return false;
9681 }
9682
9683 /* Parse an architecture extensions target attribute string specified in STR.
9684    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9685    if successful.  Update aarch64_isa_flags to reflect the ISA features
9686    modified.
9687    PRAGMA_OR_ATTR is used in potential error messages.  */
9688
9689 static bool
9690 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9691 {
9692   enum aarch64_parse_opt_result parse_res;
9693   unsigned long isa_flags = aarch64_isa_flags;
9694
9695   /* We allow "+nothing" in the beginning to clear out all architectural
9696      features if the user wants to handpick specific features.  */
9697   if (strncmp ("+nothing", str, 8) == 0)
9698     {
9699       isa_flags = 0;
9700       str += 8;
9701     }
9702
9703   parse_res = aarch64_parse_extension (str, &isa_flags);
9704
9705   if (parse_res == AARCH64_PARSE_OK)
9706     {
9707       aarch64_isa_flags = isa_flags;
9708       return true;
9709     }
9710
9711   switch (parse_res)
9712     {
9713       case AARCH64_PARSE_MISSING_ARG:
9714         error ("missing feature modifier in target %s %qs",
9715                pragma_or_attr, str);
9716         break;
9717
9718       case AARCH64_PARSE_INVALID_FEATURE:
9719         error ("invalid feature modifier in target %s %qs",
9720                pragma_or_attr, str);
9721         break;
9722
9723       default:
9724         gcc_unreachable ();
9725     }
9726
9727  return false;
9728 }
9729
9730 /* The target attributes that we support.  On top of these we also support just
9731    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9732    handled explicitly in aarch64_process_one_target_attr.  */
9733
9734 static const struct aarch64_attribute_info aarch64_attributes[] =
9735 {
9736   { "general-regs-only", aarch64_attr_mask, false, NULL,
9737      OPT_mgeneral_regs_only },
9738   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9739      OPT_mfix_cortex_a53_835769 },
9740   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9741      OPT_mfix_cortex_a53_843419 },
9742   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9743   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9744   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9745      OPT_momit_leaf_frame_pointer },
9746   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9747   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9748      OPT_march_ },
9749   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9750   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9751      OPT_mtune_ },
9752   { "sign-return-address", aarch64_attr_enum, false, NULL,
9753      OPT_msign_return_address_ },
9754   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9755 };
9756
9757 /* Parse ARG_STR which contains the definition of one target attribute.
9758    Show appropriate errors if any or return true if the attribute is valid.
9759    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9760    we're processing a target attribute or pragma.  */
9761
9762 static bool
9763 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9764 {
9765   bool invert = false;
9766
9767   size_t len = strlen (arg_str);
9768
9769   if (len == 0)
9770     {
9771       error ("malformed target %s", pragma_or_attr);
9772       return false;
9773     }
9774
9775   char *str_to_check = (char *) alloca (len + 1);
9776   strcpy (str_to_check, arg_str);
9777
9778   /* Skip leading whitespace.  */
9779   while (*str_to_check == ' ' || *str_to_check == '\t')
9780     str_to_check++;
9781
9782   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9783      It is easier to detect and handle it explicitly here rather than going
9784      through the machinery for the rest of the target attributes in this
9785      function.  */
9786   if (*str_to_check == '+')
9787     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9788
9789   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9790     {
9791       invert = true;
9792       str_to_check += 3;
9793     }
9794   char *arg = strchr (str_to_check, '=');
9795
9796   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9797      and point ARG to "foo".  */
9798   if (arg)
9799     {
9800       *arg = '\0';
9801       arg++;
9802     }
9803   const struct aarch64_attribute_info *p_attr;
9804   bool found = false;
9805   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9806     {
9807       /* If the names don't match up, or the user has given an argument
9808          to an attribute that doesn't accept one, or didn't give an argument
9809          to an attribute that expects one, fail to match.  */
9810       if (strcmp (str_to_check, p_attr->name) != 0)
9811         continue;
9812
9813       found = true;
9814       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9815                               || p_attr->attr_type == aarch64_attr_enum;
9816
9817       if (attr_need_arg_p ^ (arg != NULL))
9818         {
9819           error ("target %s %qs does not accept an argument",
9820                   pragma_or_attr, str_to_check);
9821           return false;
9822         }
9823
9824       /* If the name matches but the attribute does not allow "no-" versions
9825          then we can't match.  */
9826       if (invert && !p_attr->allow_neg)
9827         {
9828           error ("target %s %qs does not allow a negated form",
9829                   pragma_or_attr, str_to_check);
9830           return false;
9831         }
9832
9833       switch (p_attr->attr_type)
9834         {
9835         /* Has a custom handler registered.
9836            For example, cpu=, arch=, tune=.  */
9837           case aarch64_attr_custom:
9838             gcc_assert (p_attr->handler);
9839             if (!p_attr->handler (arg, pragma_or_attr))
9840               return false;
9841             break;
9842
9843           /* Either set or unset a boolean option.  */
9844           case aarch64_attr_bool:
9845             {
9846               struct cl_decoded_option decoded;
9847
9848               generate_option (p_attr->opt_num, NULL, !invert,
9849                                CL_TARGET, &decoded);
9850               aarch64_handle_option (&global_options, &global_options_set,
9851                                       &decoded, input_location);
9852               break;
9853             }
9854           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9855              should know what mask to apply given the option number.  */
9856           case aarch64_attr_mask:
9857             {
9858               struct cl_decoded_option decoded;
9859               /* We only need to specify the option number.
9860                  aarch64_handle_option will know which mask to apply.  */
9861               decoded.opt_index = p_attr->opt_num;
9862               decoded.value = !invert;
9863               aarch64_handle_option (&global_options, &global_options_set,
9864                                       &decoded, input_location);
9865               break;
9866             }
9867           /* Use the option setting machinery to set an option to an enum.  */
9868           case aarch64_attr_enum:
9869             {
9870               gcc_assert (arg);
9871               bool valid;
9872               int value;
9873               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9874                                               &value, CL_TARGET);
9875               if (valid)
9876                 {
9877                   set_option (&global_options, NULL, p_attr->opt_num, value,
9878                               NULL, DK_UNSPECIFIED, input_location,
9879                               global_dc);
9880                 }
9881               else
9882                 {
9883                   error ("target %s %s=%s is not valid",
9884                          pragma_or_attr, str_to_check, arg);
9885                 }
9886               break;
9887             }
9888           default:
9889             gcc_unreachable ();
9890         }
9891     }
9892
9893   /* If we reached here we either have found an attribute and validated
9894      it or didn't match any.  If we matched an attribute but its arguments
9895      were malformed we will have returned false already.  */
9896   return found;
9897 }
9898
9899 /* Count how many times the character C appears in
9900    NULL-terminated string STR.  */
9901
9902 static unsigned int
9903 num_occurences_in_str (char c, char *str)
9904 {
9905   unsigned int res = 0;
9906   while (*str != '\0')
9907     {
9908       if (*str == c)
9909         res++;
9910
9911       str++;
9912     }
9913
9914   return res;
9915 }
9916
9917 /* Parse the tree in ARGS that contains the target attribute information
9918    and update the global target options space.  PRAGMA_OR_ATTR is a string
9919    to be used in error messages, specifying whether this is processing
9920    a target attribute or a target pragma.  */
9921
9922 bool
9923 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9924 {
9925   if (TREE_CODE (args) == TREE_LIST)
9926     {
9927       do
9928         {
9929           tree head = TREE_VALUE (args);
9930           if (head)
9931             {
9932               if (!aarch64_process_target_attr (head, pragma_or_attr))
9933                 return false;
9934             }
9935           args = TREE_CHAIN (args);
9936         } while (args);
9937
9938       return true;
9939     }
9940
9941   if (TREE_CODE (args) != STRING_CST)
9942     {
9943       error ("attribute %<target%> argument not a string");
9944       return false;
9945     }
9946
9947   size_t len = strlen (TREE_STRING_POINTER (args));
9948   char *str_to_check = (char *) alloca (len + 1);
9949   strcpy (str_to_check, TREE_STRING_POINTER (args));
9950
9951   if (len == 0)
9952     {
9953       error ("malformed target %s value", pragma_or_attr);
9954       return false;
9955     }
9956
9957   /* Used to catch empty spaces between commas i.e.
9958      attribute ((target ("attr1,,attr2"))).  */
9959   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9960
9961   /* Handle multiple target attributes separated by ','.  */
9962   char *token = strtok (str_to_check, ",");
9963
9964   unsigned int num_attrs = 0;
9965   while (token)
9966     {
9967       num_attrs++;
9968       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9969         {
9970           error ("target %s %qs is invalid", pragma_or_attr, token);
9971           return false;
9972         }
9973
9974       token = strtok (NULL, ",");
9975     }
9976
9977   if (num_attrs != num_commas + 1)
9978     {
9979       error ("malformed target %s list %qs",
9980               pragma_or_attr, TREE_STRING_POINTER (args));
9981       return false;
9982     }
9983
9984   return true;
9985 }
9986
9987 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9988    process attribute ((target ("..."))).  */
9989
9990 static bool
9991 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9992 {
9993   struct cl_target_option cur_target;
9994   bool ret;
9995   tree old_optimize;
9996   tree new_target, new_optimize;
9997   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9998
9999   /* If what we're processing is the current pragma string then the
10000      target option node is already stored in target_option_current_node
10001      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
10002      having to re-parse the string.  This is especially useful to keep
10003      arm_neon.h compile times down since that header contains a lot
10004      of intrinsics enclosed in pragmas.  */
10005   if (!existing_target && args == current_target_pragma)
10006     {
10007       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10008       return true;
10009     }
10010   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10011
10012   old_optimize = build_optimization_node (&global_options);
10013   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10014
10015   /* If the function changed the optimization levels as well as setting
10016      target options, start with the optimizations specified.  */
10017   if (func_optimize && func_optimize != old_optimize)
10018     cl_optimization_restore (&global_options,
10019                              TREE_OPTIMIZATION (func_optimize));
10020
10021   /* Save the current target options to restore at the end.  */
10022   cl_target_option_save (&cur_target, &global_options);
10023
10024   /* If fndecl already has some target attributes applied to it, unpack
10025      them so that we add this attribute on top of them, rather than
10026      overwriting them.  */
10027   if (existing_target)
10028     {
10029       struct cl_target_option *existing_options
10030         = TREE_TARGET_OPTION (existing_target);
10031
10032       if (existing_options)
10033         cl_target_option_restore (&global_options, existing_options);
10034     }
10035   else
10036     cl_target_option_restore (&global_options,
10037                         TREE_TARGET_OPTION (target_option_current_node));
10038
10039
10040   ret = aarch64_process_target_attr (args, "attribute");
10041
10042   /* Set up any additional state.  */
10043   if (ret)
10044     {
10045       aarch64_override_options_internal (&global_options);
10046       /* Initialize SIMD builtins if we haven't already.
10047          Set current_target_pragma to NULL for the duration so that
10048          the builtin initialization code doesn't try to tag the functions
10049          being built with the attributes specified by any current pragma, thus
10050          going into an infinite recursion.  */
10051       if (TARGET_SIMD)
10052         {
10053           tree saved_current_target_pragma = current_target_pragma;
10054           current_target_pragma = NULL;
10055           aarch64_init_simd_builtins ();
10056           current_target_pragma = saved_current_target_pragma;
10057         }
10058       new_target = build_target_option_node (&global_options);
10059     }
10060   else
10061     new_target = NULL;
10062
10063   new_optimize = build_optimization_node (&global_options);
10064
10065   if (fndecl && ret)
10066     {
10067       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10068
10069       if (old_optimize != new_optimize)
10070         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10071     }
10072
10073   cl_target_option_restore (&global_options, &cur_target);
10074
10075   if (old_optimize != new_optimize)
10076     cl_optimization_restore (&global_options,
10077                              TREE_OPTIMIZATION (old_optimize));
10078   return ret;
10079 }
10080
10081 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10082    tri-bool options (yes, no, don't care) and the default value is
10083    DEF, determine whether to reject inlining.  */
10084
10085 static bool
10086 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10087                                      int dont_care, int def)
10088 {
10089   /* If the callee doesn't care, always allow inlining.  */
10090   if (callee == dont_care)
10091     return true;
10092
10093   /* If the caller doesn't care, always allow inlining.  */
10094   if (caller == dont_care)
10095     return true;
10096
10097   /* Otherwise, allow inlining if either the callee and caller values
10098      agree, or if the callee is using the default value.  */
10099   return (callee == caller || callee == def);
10100 }
10101
10102 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10103    to inline CALLEE into CALLER based on target-specific info.
10104    Make sure that the caller and callee have compatible architectural
10105    features.  Then go through the other possible target attributes
10106    and see if they can block inlining.  Try not to reject always_inline
10107    callees unless they are incompatible architecturally.  */
10108
10109 static bool
10110 aarch64_can_inline_p (tree caller, tree callee)
10111 {
10112   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10113   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10114
10115   /* If callee has no option attributes, then it is ok to inline.  */
10116   if (!callee_tree)
10117     return true;
10118
10119   struct cl_target_option *caller_opts
10120         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10121                                            : target_option_default_node);
10122
10123   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10124
10125
10126   /* Callee's ISA flags should be a subset of the caller's.  */
10127   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10128        != callee_opts->x_aarch64_isa_flags)
10129     return false;
10130
10131   /* Allow non-strict aligned functions inlining into strict
10132      aligned ones.  */
10133   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10134        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10135       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10136            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10137     return false;
10138
10139   bool always_inline = lookup_attribute ("always_inline",
10140                                           DECL_ATTRIBUTES (callee));
10141
10142   /* If the architectural features match up and the callee is always_inline
10143      then the other attributes don't matter.  */
10144   if (always_inline)
10145     return true;
10146
10147   if (caller_opts->x_aarch64_cmodel_var
10148       != callee_opts->x_aarch64_cmodel_var)
10149     return false;
10150
10151   if (caller_opts->x_aarch64_tls_dialect
10152       != callee_opts->x_aarch64_tls_dialect)
10153     return false;
10154
10155   /* Honour explicit requests to workaround errata.  */
10156   if (!aarch64_tribools_ok_for_inlining_p (
10157           caller_opts->x_aarch64_fix_a53_err835769,
10158           callee_opts->x_aarch64_fix_a53_err835769,
10159           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10160     return false;
10161
10162   if (!aarch64_tribools_ok_for_inlining_p (
10163           caller_opts->x_aarch64_fix_a53_err843419,
10164           callee_opts->x_aarch64_fix_a53_err843419,
10165           2, TARGET_FIX_ERR_A53_843419))
10166     return false;
10167
10168   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10169      caller and calle and they don't match up, reject inlining.  */
10170   if (!aarch64_tribools_ok_for_inlining_p (
10171           caller_opts->x_flag_omit_leaf_frame_pointer,
10172           callee_opts->x_flag_omit_leaf_frame_pointer,
10173           2, 1))
10174     return false;
10175
10176   /* If the callee has specific tuning overrides, respect them.  */
10177   if (callee_opts->x_aarch64_override_tune_string != NULL
10178       && caller_opts->x_aarch64_override_tune_string == NULL)
10179     return false;
10180
10181   /* If the user specified tuning override strings for the
10182      caller and callee and they don't match up, reject inlining.
10183      We just do a string compare here, we don't analyze the meaning
10184      of the string, as it would be too costly for little gain.  */
10185   if (callee_opts->x_aarch64_override_tune_string
10186       && caller_opts->x_aarch64_override_tune_string
10187       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10188                   caller_opts->x_aarch64_override_tune_string) != 0))
10189     return false;
10190
10191   return true;
10192 }
10193
10194 /* Return true if SYMBOL_REF X binds locally.  */
10195
10196 static bool
10197 aarch64_symbol_binds_local_p (const_rtx x)
10198 {
10199   return (SYMBOL_REF_DECL (x)
10200           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10201           : SYMBOL_REF_LOCAL_P (x));
10202 }
10203
10204 /* Return true if SYMBOL_REF X is thread local */
10205 static bool
10206 aarch64_tls_symbol_p (rtx x)
10207 {
10208   if (! TARGET_HAVE_TLS)
10209     return false;
10210
10211   if (GET_CODE (x) != SYMBOL_REF)
10212     return false;
10213
10214   return SYMBOL_REF_TLS_MODEL (x) != 0;
10215 }
10216
10217 /* Classify a TLS symbol into one of the TLS kinds.  */
10218 enum aarch64_symbol_type
10219 aarch64_classify_tls_symbol (rtx x)
10220 {
10221   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10222
10223   switch (tls_kind)
10224     {
10225     case TLS_MODEL_GLOBAL_DYNAMIC:
10226     case TLS_MODEL_LOCAL_DYNAMIC:
10227       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10228
10229     case TLS_MODEL_INITIAL_EXEC:
10230       switch (aarch64_cmodel)
10231         {
10232         case AARCH64_CMODEL_TINY:
10233         case AARCH64_CMODEL_TINY_PIC:
10234           return SYMBOL_TINY_TLSIE;
10235         default:
10236           return SYMBOL_SMALL_TLSIE;
10237         }
10238
10239     case TLS_MODEL_LOCAL_EXEC:
10240       if (aarch64_tls_size == 12)
10241         return SYMBOL_TLSLE12;
10242       else if (aarch64_tls_size == 24)
10243         return SYMBOL_TLSLE24;
10244       else if (aarch64_tls_size == 32)
10245         return SYMBOL_TLSLE32;
10246       else if (aarch64_tls_size == 48)
10247         return SYMBOL_TLSLE48;
10248       else
10249         gcc_unreachable ();
10250
10251     case TLS_MODEL_EMULATED:
10252     case TLS_MODEL_NONE:
10253       return SYMBOL_FORCE_TO_MEM;
10254
10255     default:
10256       gcc_unreachable ();
10257     }
10258 }
10259
10260 /* Return the method that should be used to access SYMBOL_REF or
10261    LABEL_REF X.  */
10262
10263 enum aarch64_symbol_type
10264 aarch64_classify_symbol (rtx x, rtx offset)
10265 {
10266   if (GET_CODE (x) == LABEL_REF)
10267     {
10268       switch (aarch64_cmodel)
10269         {
10270         case AARCH64_CMODEL_LARGE:
10271           return SYMBOL_FORCE_TO_MEM;
10272
10273         case AARCH64_CMODEL_TINY_PIC:
10274         case AARCH64_CMODEL_TINY:
10275           return SYMBOL_TINY_ABSOLUTE;
10276
10277         case AARCH64_CMODEL_SMALL_SPIC:
10278         case AARCH64_CMODEL_SMALL_PIC:
10279         case AARCH64_CMODEL_SMALL:
10280           return SYMBOL_SMALL_ABSOLUTE;
10281
10282         default:
10283           gcc_unreachable ();
10284         }
10285     }
10286
10287   if (GET_CODE (x) == SYMBOL_REF)
10288     {
10289       if (aarch64_tls_symbol_p (x))
10290         return aarch64_classify_tls_symbol (x);
10291
10292       switch (aarch64_cmodel)
10293         {
10294         case AARCH64_CMODEL_TINY:
10295           /* When we retrieve symbol + offset address, we have to make sure
10296              the offset does not cause overflow of the final address.  But
10297              we have no way of knowing the address of symbol at compile time
10298              so we can't accurately say if the distance between the PC and
10299              symbol + offset is outside the addressible range of +/-1M in the
10300              TINY code model.  So we rely on images not being greater than
10301              1M and cap the offset at 1M and anything beyond 1M will have to
10302              be loaded using an alternative mechanism.  Furthermore if the
10303              symbol is a weak reference to something that isn't known to
10304              resolve to a symbol in this module, then force to memory.  */
10305           if ((SYMBOL_REF_WEAK (x)
10306                && !aarch64_symbol_binds_local_p (x))
10307               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10308             return SYMBOL_FORCE_TO_MEM;
10309           return SYMBOL_TINY_ABSOLUTE;
10310
10311         case AARCH64_CMODEL_SMALL:
10312           /* Same reasoning as the tiny code model, but the offset cap here is
10313              4G.  */
10314           if ((SYMBOL_REF_WEAK (x)
10315                && !aarch64_symbol_binds_local_p (x))
10316               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10317                             HOST_WIDE_INT_C (4294967264)))
10318             return SYMBOL_FORCE_TO_MEM;
10319           return SYMBOL_SMALL_ABSOLUTE;
10320
10321         case AARCH64_CMODEL_TINY_PIC:
10322           if (!aarch64_symbol_binds_local_p (x))
10323             return SYMBOL_TINY_GOT;
10324           return SYMBOL_TINY_ABSOLUTE;
10325
10326         case AARCH64_CMODEL_SMALL_SPIC:
10327         case AARCH64_CMODEL_SMALL_PIC:
10328           if (!aarch64_symbol_binds_local_p (x))
10329             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10330                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10331           return SYMBOL_SMALL_ABSOLUTE;
10332
10333         case AARCH64_CMODEL_LARGE:
10334           /* This is alright even in PIC code as the constant
10335              pool reference is always PC relative and within
10336              the same translation unit.  */
10337           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10338             return SYMBOL_SMALL_ABSOLUTE;
10339           else
10340             return SYMBOL_FORCE_TO_MEM;
10341
10342         default:
10343           gcc_unreachable ();
10344         }
10345     }
10346
10347   /* By default push everything into the constant pool.  */
10348   return SYMBOL_FORCE_TO_MEM;
10349 }
10350
10351 bool
10352 aarch64_constant_address_p (rtx x)
10353 {
10354   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10355 }
10356
10357 bool
10358 aarch64_legitimate_pic_operand_p (rtx x)
10359 {
10360   if (GET_CODE (x) == SYMBOL_REF
10361       || (GET_CODE (x) == CONST
10362           && GET_CODE (XEXP (x, 0)) == PLUS
10363           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10364      return false;
10365
10366   return true;
10367 }
10368
10369 /* Return true if X holds either a quarter-precision or
10370      floating-point +0.0 constant.  */
10371 static bool
10372 aarch64_valid_floating_const (rtx x)
10373 {
10374   if (!CONST_DOUBLE_P (x))
10375     return false;
10376
10377   /* This call determines which constants can be used in mov<mode>
10378      as integer moves instead of constant loads.  */
10379   if (aarch64_float_const_rtx_p (x))
10380     return true;
10381
10382   return aarch64_float_const_representable_p (x);
10383 }
10384
10385 static bool
10386 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10387 {
10388   /* Do not allow vector struct mode constants.  We could support
10389      0 and -1 easily, but they need support in aarch64-simd.md.  */
10390   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10391     return false;
10392
10393   /* For these cases we never want to use a literal load.
10394      As such we have to prevent the compiler from forcing these
10395      to memory.  */
10396   if ((GET_CODE (x) == CONST_VECTOR
10397        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10398       || CONST_INT_P (x)
10399       || aarch64_valid_floating_const (x)
10400       || aarch64_can_const_movi_rtx_p (x, mode)
10401       || aarch64_float_const_rtx_p (x))
10402         return !targetm.cannot_force_const_mem (mode, x);
10403
10404   if (GET_CODE (x) == HIGH
10405       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10406     return true;
10407
10408   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10409      so spilling them is better than rematerialization.  */
10410   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10411     return true;
10412
10413   return aarch64_constant_address_p (x);
10414 }
10415
10416 rtx
10417 aarch64_load_tp (rtx target)
10418 {
10419   if (!target
10420       || GET_MODE (target) != Pmode
10421       || !register_operand (target, Pmode))
10422     target = gen_reg_rtx (Pmode);
10423
10424   /* Can return in any reg.  */
10425   emit_insn (gen_aarch64_load_tp_hard (target));
10426   return target;
10427 }
10428
10429 /* On AAPCS systems, this is the "struct __va_list".  */
10430 static GTY(()) tree va_list_type;
10431
10432 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10433    Return the type to use as __builtin_va_list.
10434
10435    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10436
10437    struct __va_list
10438    {
10439      void *__stack;
10440      void *__gr_top;
10441      void *__vr_top;
10442      int   __gr_offs;
10443      int   __vr_offs;
10444    };  */
10445
10446 static tree
10447 aarch64_build_builtin_va_list (void)
10448 {
10449   tree va_list_name;
10450   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10451
10452   /* Create the type.  */
10453   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10454   /* Give it the required name.  */
10455   va_list_name = build_decl (BUILTINS_LOCATION,
10456                              TYPE_DECL,
10457                              get_identifier ("__va_list"),
10458                              va_list_type);
10459   DECL_ARTIFICIAL (va_list_name) = 1;
10460   TYPE_NAME (va_list_type) = va_list_name;
10461   TYPE_STUB_DECL (va_list_type) = va_list_name;
10462
10463   /* Create the fields.  */
10464   f_stack = build_decl (BUILTINS_LOCATION,
10465                         FIELD_DECL, get_identifier ("__stack"),
10466                         ptr_type_node);
10467   f_grtop = build_decl (BUILTINS_LOCATION,
10468                         FIELD_DECL, get_identifier ("__gr_top"),
10469                         ptr_type_node);
10470   f_vrtop = build_decl (BUILTINS_LOCATION,
10471                         FIELD_DECL, get_identifier ("__vr_top"),
10472                         ptr_type_node);
10473   f_groff = build_decl (BUILTINS_LOCATION,
10474                         FIELD_DECL, get_identifier ("__gr_offs"),
10475                         integer_type_node);
10476   f_vroff = build_decl (BUILTINS_LOCATION,
10477                         FIELD_DECL, get_identifier ("__vr_offs"),
10478                         integer_type_node);
10479
10480   /* Tell tree-stdarg pass about our internal offset fields.
10481      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10482      purpose to identify whether the code is updating va_list internal
10483      offset fields through irregular way.  */
10484   va_list_gpr_counter_field = f_groff;
10485   va_list_fpr_counter_field = f_vroff;
10486
10487   DECL_ARTIFICIAL (f_stack) = 1;
10488   DECL_ARTIFICIAL (f_grtop) = 1;
10489   DECL_ARTIFICIAL (f_vrtop) = 1;
10490   DECL_ARTIFICIAL (f_groff) = 1;
10491   DECL_ARTIFICIAL (f_vroff) = 1;
10492
10493   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10494   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10495   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10496   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10497   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10498
10499   TYPE_FIELDS (va_list_type) = f_stack;
10500   DECL_CHAIN (f_stack) = f_grtop;
10501   DECL_CHAIN (f_grtop) = f_vrtop;
10502   DECL_CHAIN (f_vrtop) = f_groff;
10503   DECL_CHAIN (f_groff) = f_vroff;
10504
10505   /* Compute its layout.  */
10506   layout_type (va_list_type);
10507
10508   return va_list_type;
10509 }
10510
10511 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10512 static void
10513 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10514 {
10515   const CUMULATIVE_ARGS *cum;
10516   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10517   tree stack, grtop, vrtop, groff, vroff;
10518   tree t;
10519   int gr_save_area_size = cfun->va_list_gpr_size;
10520   int vr_save_area_size = cfun->va_list_fpr_size;
10521   int vr_offset;
10522
10523   cum = &crtl->args.info;
10524   if (cfun->va_list_gpr_size)
10525     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10526                              cfun->va_list_gpr_size);
10527   if (cfun->va_list_fpr_size)
10528     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10529                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10530
10531   if (!TARGET_FLOAT)
10532     {
10533       gcc_assert (cum->aapcs_nvrn == 0);
10534       vr_save_area_size = 0;
10535     }
10536
10537   f_stack = TYPE_FIELDS (va_list_type_node);
10538   f_grtop = DECL_CHAIN (f_stack);
10539   f_vrtop = DECL_CHAIN (f_grtop);
10540   f_groff = DECL_CHAIN (f_vrtop);
10541   f_vroff = DECL_CHAIN (f_groff);
10542
10543   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10544                   NULL_TREE);
10545   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10546                   NULL_TREE);
10547   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10548                   NULL_TREE);
10549   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10550                   NULL_TREE);
10551   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10552                   NULL_TREE);
10553
10554   /* Emit code to initialize STACK, which points to the next varargs stack
10555      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10556      by named arguments.  STACK is 8-byte aligned.  */
10557   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10558   if (cum->aapcs_stack_size > 0)
10559     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10560   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10561   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10562
10563   /* Emit code to initialize GRTOP, the top of the GR save area.
10564      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10565   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10566   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10567   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10568
10569   /* Emit code to initialize VRTOP, the top of the VR save area.
10570      This address is gr_save_area_bytes below GRTOP, rounded
10571      down to the next 16-byte boundary.  */
10572   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10573   vr_offset = ROUND_UP (gr_save_area_size,
10574                         STACK_BOUNDARY / BITS_PER_UNIT);
10575
10576   if (vr_offset)
10577     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10578   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10579   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10580
10581   /* Emit code to initialize GROFF, the offset from GRTOP of the
10582      next GPR argument.  */
10583   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10584               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10585   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10586
10587   /* Likewise emit code to initialize VROFF, the offset from FTOP
10588      of the next VR argument.  */
10589   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10590               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10591   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10592 }
10593
10594 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10595
10596 static tree
10597 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10598                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10599 {
10600   tree addr;
10601   bool indirect_p;
10602   bool is_ha;           /* is HFA or HVA.  */
10603   bool dw_align;        /* double-word align.  */
10604   machine_mode ag_mode = VOIDmode;
10605   int nregs;
10606   machine_mode mode;
10607
10608   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10609   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10610   HOST_WIDE_INT size, rsize, adjust, align;
10611   tree t, u, cond1, cond2;
10612
10613   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10614   if (indirect_p)
10615     type = build_pointer_type (type);
10616
10617   mode = TYPE_MODE (type);
10618
10619   f_stack = TYPE_FIELDS (va_list_type_node);
10620   f_grtop = DECL_CHAIN (f_stack);
10621   f_vrtop = DECL_CHAIN (f_grtop);
10622   f_groff = DECL_CHAIN (f_vrtop);
10623   f_vroff = DECL_CHAIN (f_groff);
10624
10625   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10626                   f_stack, NULL_TREE);
10627   size = int_size_in_bytes (type);
10628   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10629
10630   dw_align = false;
10631   adjust = 0;
10632   if (aarch64_vfp_is_call_or_return_candidate (mode,
10633                                                type,
10634                                                &ag_mode,
10635                                                &nregs,
10636                                                &is_ha))
10637     {
10638       /* TYPE passed in fp/simd registers.  */
10639       if (!TARGET_FLOAT)
10640         aarch64_err_no_fpadvsimd (mode, "varargs");
10641
10642       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10643                       unshare_expr (valist), f_vrtop, NULL_TREE);
10644       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10645                       unshare_expr (valist), f_vroff, NULL_TREE);
10646
10647       rsize = nregs * UNITS_PER_VREG;
10648
10649       if (is_ha)
10650         {
10651           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10652             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10653         }
10654       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10655                && size < UNITS_PER_VREG)
10656         {
10657           adjust = UNITS_PER_VREG - size;
10658         }
10659     }
10660   else
10661     {
10662       /* TYPE passed in general registers.  */
10663       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10664                       unshare_expr (valist), f_grtop, NULL_TREE);
10665       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10666                       unshare_expr (valist), f_groff, NULL_TREE);
10667       rsize = ROUND_UP (size, UNITS_PER_WORD);
10668       nregs = rsize / UNITS_PER_WORD;
10669
10670       if (align > 8)
10671         dw_align = true;
10672
10673       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10674           && size < UNITS_PER_WORD)
10675         {
10676           adjust = UNITS_PER_WORD  - size;
10677         }
10678     }
10679
10680   /* Get a local temporary for the field value.  */
10681   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10682
10683   /* Emit code to branch if off >= 0.  */
10684   t = build2 (GE_EXPR, boolean_type_node, off,
10685               build_int_cst (TREE_TYPE (off), 0));
10686   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10687
10688   if (dw_align)
10689     {
10690       /* Emit: offs = (offs + 15) & -16.  */
10691       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10692                   build_int_cst (TREE_TYPE (off), 15));
10693       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10694                   build_int_cst (TREE_TYPE (off), -16));
10695       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10696     }
10697   else
10698     roundup = NULL;
10699
10700   /* Update ap.__[g|v]r_offs  */
10701   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10702               build_int_cst (TREE_TYPE (off), rsize));
10703   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10704
10705   /* String up.  */
10706   if (roundup)
10707     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10708
10709   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10710   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10711               build_int_cst (TREE_TYPE (f_off), 0));
10712   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10713
10714   /* String up: make sure the assignment happens before the use.  */
10715   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10716   COND_EXPR_ELSE (cond1) = t;
10717
10718   /* Prepare the trees handling the argument that is passed on the stack;
10719      the top level node will store in ON_STACK.  */
10720   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10721   if (align > 8)
10722     {
10723       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10724       t = fold_convert (intDI_type_node, arg);
10725       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10726                   build_int_cst (TREE_TYPE (t), 15));
10727       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10728                   build_int_cst (TREE_TYPE (t), -16));
10729       t = fold_convert (TREE_TYPE (arg), t);
10730       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10731     }
10732   else
10733     roundup = NULL;
10734   /* Advance ap.__stack  */
10735   t = fold_convert (intDI_type_node, arg);
10736   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10737               build_int_cst (TREE_TYPE (t), size + 7));
10738   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10739               build_int_cst (TREE_TYPE (t), -8));
10740   t = fold_convert (TREE_TYPE (arg), t);
10741   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10742   /* String up roundup and advance.  */
10743   if (roundup)
10744     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10745   /* String up with arg */
10746   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10747   /* Big-endianness related address adjustment.  */
10748   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10749       && size < UNITS_PER_WORD)
10750   {
10751     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10752                 size_int (UNITS_PER_WORD - size));
10753     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10754   }
10755
10756   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10757   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10758
10759   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10760   t = off;
10761   if (adjust)
10762     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10763                 build_int_cst (TREE_TYPE (off), adjust));
10764
10765   t = fold_convert (sizetype, t);
10766   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10767
10768   if (is_ha)
10769     {
10770       /* type ha; // treat as "struct {ftype field[n];}"
10771          ... [computing offs]
10772          for (i = 0; i <nregs; ++i, offs += 16)
10773            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10774          return ha;  */
10775       int i;
10776       tree tmp_ha, field_t, field_ptr_t;
10777
10778       /* Declare a local variable.  */
10779       tmp_ha = create_tmp_var_raw (type, "ha");
10780       gimple_add_tmp_var (tmp_ha);
10781
10782       /* Establish the base type.  */
10783       switch (ag_mode)
10784         {
10785         case E_SFmode:
10786           field_t = float_type_node;
10787           field_ptr_t = float_ptr_type_node;
10788           break;
10789         case E_DFmode:
10790           field_t = double_type_node;
10791           field_ptr_t = double_ptr_type_node;
10792           break;
10793         case E_TFmode:
10794           field_t = long_double_type_node;
10795           field_ptr_t = long_double_ptr_type_node;
10796           break;
10797         case E_HFmode:
10798           field_t = aarch64_fp16_type_node;
10799           field_ptr_t = aarch64_fp16_ptr_type_node;
10800           break;
10801         case E_V2SImode:
10802         case E_V4SImode:
10803             {
10804               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10805               field_t = build_vector_type_for_mode (innertype, ag_mode);
10806               field_ptr_t = build_pointer_type (field_t);
10807             }
10808           break;
10809         default:
10810           gcc_assert (0);
10811         }
10812
10813       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10814       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10815       addr = t;
10816       t = fold_convert (field_ptr_t, addr);
10817       t = build2 (MODIFY_EXPR, field_t,
10818                   build1 (INDIRECT_REF, field_t, tmp_ha),
10819                   build1 (INDIRECT_REF, field_t, t));
10820
10821       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10822       for (i = 1; i < nregs; ++i)
10823         {
10824           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10825           u = fold_convert (field_ptr_t, addr);
10826           u = build2 (MODIFY_EXPR, field_t,
10827                       build2 (MEM_REF, field_t, tmp_ha,
10828                               build_int_cst (field_ptr_t,
10829                                              (i *
10830                                               int_size_in_bytes (field_t)))),
10831                       build1 (INDIRECT_REF, field_t, u));
10832           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10833         }
10834
10835       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10836       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10837     }
10838
10839   COND_EXPR_ELSE (cond2) = t;
10840   addr = fold_convert (build_pointer_type (type), cond1);
10841   addr = build_va_arg_indirect_ref (addr);
10842
10843   if (indirect_p)
10844     addr = build_va_arg_indirect_ref (addr);
10845
10846   return addr;
10847 }
10848
10849 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10850
10851 static void
10852 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10853                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10854                                 int no_rtl)
10855 {
10856   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10857   CUMULATIVE_ARGS local_cum;
10858   int gr_saved = cfun->va_list_gpr_size;
10859   int vr_saved = cfun->va_list_fpr_size;
10860
10861   /* The caller has advanced CUM up to, but not beyond, the last named
10862      argument.  Advance a local copy of CUM past the last "real" named
10863      argument, to find out how many registers are left over.  */
10864   local_cum = *cum;
10865   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10866
10867   /* Found out how many registers we need to save.
10868      Honor tree-stdvar analysis results.  */
10869   if (cfun->va_list_gpr_size)
10870     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10871                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10872   if (cfun->va_list_fpr_size)
10873     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10874                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10875
10876   if (!TARGET_FLOAT)
10877     {
10878       gcc_assert (local_cum.aapcs_nvrn == 0);
10879       vr_saved = 0;
10880     }
10881
10882   if (!no_rtl)
10883     {
10884       if (gr_saved > 0)
10885         {
10886           rtx ptr, mem;
10887
10888           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10889           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10890                                - gr_saved * UNITS_PER_WORD);
10891           mem = gen_frame_mem (BLKmode, ptr);
10892           set_mem_alias_set (mem, get_varargs_alias_set ());
10893
10894           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10895                                mem, gr_saved);
10896         }
10897       if (vr_saved > 0)
10898         {
10899           /* We can't use move_block_from_reg, because it will use
10900              the wrong mode, storing D regs only.  */
10901           machine_mode mode = TImode;
10902           int off, i, vr_start;
10903
10904           /* Set OFF to the offset from virtual_incoming_args_rtx of
10905              the first vector register.  The VR save area lies below
10906              the GR one, and is aligned to 16 bytes.  */
10907           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10908                            STACK_BOUNDARY / BITS_PER_UNIT);
10909           off -= vr_saved * UNITS_PER_VREG;
10910
10911           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10912           for (i = 0; i < vr_saved; ++i)
10913             {
10914               rtx ptr, mem;
10915
10916               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10917               mem = gen_frame_mem (mode, ptr);
10918               set_mem_alias_set (mem, get_varargs_alias_set ());
10919               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10920               off += UNITS_PER_VREG;
10921             }
10922         }
10923     }
10924
10925   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10926      any complication of having crtl->args.pretend_args_size changed.  */
10927   cfun->machine->frame.saved_varargs_size
10928     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10929                  STACK_BOUNDARY / BITS_PER_UNIT)
10930        + vr_saved * UNITS_PER_VREG);
10931 }
10932
10933 static void
10934 aarch64_conditional_register_usage (void)
10935 {
10936   int i;
10937   if (!TARGET_FLOAT)
10938     {
10939       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10940         {
10941           fixed_regs[i] = 1;
10942           call_used_regs[i] = 1;
10943         }
10944     }
10945 }
10946
10947 /* Walk down the type tree of TYPE counting consecutive base elements.
10948    If *MODEP is VOIDmode, then set it to the first valid floating point
10949    type.  If a non-floating point type is found, or if a floating point
10950    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10951    otherwise return the count in the sub-tree.  */
10952 static int
10953 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10954 {
10955   machine_mode mode;
10956   HOST_WIDE_INT size;
10957
10958   switch (TREE_CODE (type))
10959     {
10960     case REAL_TYPE:
10961       mode = TYPE_MODE (type);
10962       if (mode != DFmode && mode != SFmode
10963           && mode != TFmode && mode != HFmode)
10964         return -1;
10965
10966       if (*modep == VOIDmode)
10967         *modep = mode;
10968
10969       if (*modep == mode)
10970         return 1;
10971
10972       break;
10973
10974     case COMPLEX_TYPE:
10975       mode = TYPE_MODE (TREE_TYPE (type));
10976       if (mode != DFmode && mode != SFmode
10977           && mode != TFmode && mode != HFmode)
10978         return -1;
10979
10980       if (*modep == VOIDmode)
10981         *modep = mode;
10982
10983       if (*modep == mode)
10984         return 2;
10985
10986       break;
10987
10988     case VECTOR_TYPE:
10989       /* Use V2SImode and V4SImode as representatives of all 64-bit
10990          and 128-bit vector types.  */
10991       size = int_size_in_bytes (type);
10992       switch (size)
10993         {
10994         case 8:
10995           mode = V2SImode;
10996           break;
10997         case 16:
10998           mode = V4SImode;
10999           break;
11000         default:
11001           return -1;
11002         }
11003
11004       if (*modep == VOIDmode)
11005         *modep = mode;
11006
11007       /* Vector modes are considered to be opaque: two vectors are
11008          equivalent for the purposes of being homogeneous aggregates
11009          if they are the same size.  */
11010       if (*modep == mode)
11011         return 1;
11012
11013       break;
11014
11015     case ARRAY_TYPE:
11016       {
11017         int count;
11018         tree index = TYPE_DOMAIN (type);
11019
11020         /* Can't handle incomplete types nor sizes that are not
11021            fixed.  */
11022         if (!COMPLETE_TYPE_P (type)
11023             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11024           return -1;
11025
11026         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11027         if (count == -1
11028             || !index
11029             || !TYPE_MAX_VALUE (index)
11030             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11031             || !TYPE_MIN_VALUE (index)
11032             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11033             || count < 0)
11034           return -1;
11035
11036         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11037                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11038
11039         /* There must be no padding.  */
11040         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11041           return -1;
11042
11043         return count;
11044       }
11045
11046     case RECORD_TYPE:
11047       {
11048         int count = 0;
11049         int sub_count;
11050         tree field;
11051
11052         /* Can't handle incomplete types nor sizes that are not
11053            fixed.  */
11054         if (!COMPLETE_TYPE_P (type)
11055             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11056           return -1;
11057
11058         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11059           {
11060             if (TREE_CODE (field) != FIELD_DECL)
11061               continue;
11062
11063             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11064             if (sub_count < 0)
11065               return -1;
11066             count += sub_count;
11067           }
11068
11069         /* There must be no padding.  */
11070         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11071           return -1;
11072
11073         return count;
11074       }
11075
11076     case UNION_TYPE:
11077     case QUAL_UNION_TYPE:
11078       {
11079         /* These aren't very interesting except in a degenerate case.  */
11080         int count = 0;
11081         int sub_count;
11082         tree field;
11083
11084         /* Can't handle incomplete types nor sizes that are not
11085            fixed.  */
11086         if (!COMPLETE_TYPE_P (type)
11087             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11088           return -1;
11089
11090         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11091           {
11092             if (TREE_CODE (field) != FIELD_DECL)
11093               continue;
11094
11095             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11096             if (sub_count < 0)
11097               return -1;
11098             count = count > sub_count ? count : sub_count;
11099           }
11100
11101         /* There must be no padding.  */
11102         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11103           return -1;
11104
11105         return count;
11106       }
11107
11108     default:
11109       break;
11110     }
11111
11112   return -1;
11113 }
11114
11115 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11116    type as described in AAPCS64 \S 4.1.2.
11117
11118    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11119
11120 static bool
11121 aarch64_short_vector_p (const_tree type,
11122                         machine_mode mode)
11123 {
11124   HOST_WIDE_INT size = -1;
11125
11126   if (type && TREE_CODE (type) == VECTOR_TYPE)
11127     size = int_size_in_bytes (type);
11128   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11129             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11130     size = GET_MODE_SIZE (mode);
11131
11132   return (size == 8 || size == 16);
11133 }
11134
11135 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11136    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11137    array types.  The C99 floating-point complex types are also considered
11138    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11139    types, which are GCC extensions and out of the scope of AAPCS64, are
11140    treated as composite types here as well.
11141
11142    Note that MODE itself is not sufficient in determining whether a type
11143    is such a composite type or not.  This is because
11144    stor-layout.c:compute_record_mode may have already changed the MODE
11145    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11146    structure with only one field may have its MODE set to the mode of the
11147    field.  Also an integer mode whose size matches the size of the
11148    RECORD_TYPE type may be used to substitute the original mode
11149    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11150    solely relied on.  */
11151
11152 static bool
11153 aarch64_composite_type_p (const_tree type,
11154                           machine_mode mode)
11155 {
11156   if (aarch64_short_vector_p (type, mode))
11157     return false;
11158
11159   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11160     return true;
11161
11162   if (mode == BLKmode
11163       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11164       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11165     return true;
11166
11167   return false;
11168 }
11169
11170 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11171    shall be passed or returned in simd/fp register(s) (providing these
11172    parameter passing registers are available).
11173
11174    Upon successful return, *COUNT returns the number of needed registers,
11175    *BASE_MODE returns the mode of the individual register and when IS_HAF
11176    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11177    floating-point aggregate or a homogeneous short-vector aggregate.  */
11178
11179 static bool
11180 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11181                                          const_tree type,
11182                                          machine_mode *base_mode,
11183                                          int *count,
11184                                          bool *is_ha)
11185 {
11186   machine_mode new_mode = VOIDmode;
11187   bool composite_p = aarch64_composite_type_p (type, mode);
11188
11189   if (is_ha != NULL) *is_ha = false;
11190
11191   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11192       || aarch64_short_vector_p (type, mode))
11193     {
11194       *count = 1;
11195       new_mode = mode;
11196     }
11197   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11198     {
11199       if (is_ha != NULL) *is_ha = true;
11200       *count = 2;
11201       new_mode = GET_MODE_INNER (mode);
11202     }
11203   else if (type && composite_p)
11204     {
11205       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11206
11207       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11208         {
11209           if (is_ha != NULL) *is_ha = true;
11210           *count = ag_count;
11211         }
11212       else
11213         return false;
11214     }
11215   else
11216     return false;
11217
11218   *base_mode = new_mode;
11219   return true;
11220 }
11221
11222 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11223
11224 static rtx
11225 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11226                           int incoming ATTRIBUTE_UNUSED)
11227 {
11228   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11229 }
11230
11231 /* Implements target hook vector_mode_supported_p.  */
11232 static bool
11233 aarch64_vector_mode_supported_p (machine_mode mode)
11234 {
11235   if (TARGET_SIMD
11236       && (mode == V4SImode  || mode == V8HImode
11237           || mode == V16QImode || mode == V2DImode
11238           || mode == V2SImode  || mode == V4HImode
11239           || mode == V8QImode || mode == V2SFmode
11240           || mode == V4SFmode || mode == V2DFmode
11241           || mode == V4HFmode || mode == V8HFmode
11242           || mode == V1DFmode))
11243     return true;
11244
11245   return false;
11246 }
11247
11248 /* Return appropriate SIMD container
11249    for MODE within a vector of WIDTH bits.  */
11250 static machine_mode
11251 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11252 {
11253   gcc_assert (width == 64 || width == 128);
11254   if (TARGET_SIMD)
11255     {
11256       if (width == 128)
11257         switch (mode)
11258           {
11259           case E_DFmode:
11260             return V2DFmode;
11261           case E_SFmode:
11262             return V4SFmode;
11263           case E_HFmode:
11264             return V8HFmode;
11265           case E_SImode:
11266             return V4SImode;
11267           case E_HImode:
11268             return V8HImode;
11269           case E_QImode:
11270             return V16QImode;
11271           case E_DImode:
11272             return V2DImode;
11273           default:
11274             break;
11275           }
11276       else
11277         switch (mode)
11278           {
11279           case E_SFmode:
11280             return V2SFmode;
11281           case E_HFmode:
11282             return V4HFmode;
11283           case E_SImode:
11284             return V2SImode;
11285           case E_HImode:
11286             return V4HImode;
11287           case E_QImode:
11288             return V8QImode;
11289           default:
11290             break;
11291           }
11292     }
11293   return word_mode;
11294 }
11295
11296 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11297 static machine_mode
11298 aarch64_preferred_simd_mode (scalar_mode mode)
11299 {
11300   return aarch64_simd_container_mode (mode, 128);
11301 }
11302
11303 /* Return the bitmask of possible vector sizes for the vectorizer
11304    to iterate over.  */
11305 static unsigned int
11306 aarch64_autovectorize_vector_sizes (void)
11307 {
11308   return (16 | 8);
11309 }
11310
11311 /* Implement TARGET_MANGLE_TYPE.  */
11312
11313 static const char *
11314 aarch64_mangle_type (const_tree type)
11315 {
11316   /* The AArch64 ABI documents say that "__va_list" has to be
11317      managled as if it is in the "std" namespace.  */
11318   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11319     return "St9__va_list";
11320
11321   /* Half-precision float.  */
11322   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11323     return "Dh";
11324
11325   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11326      builtin types.  */
11327   if (TYPE_NAME (type) != NULL)
11328     return aarch64_mangle_builtin_type (type);
11329
11330   /* Use the default mangling.  */
11331   return NULL;
11332 }
11333
11334 /* Find the first rtx_insn before insn that will generate an assembly
11335    instruction.  */
11336
11337 static rtx_insn *
11338 aarch64_prev_real_insn (rtx_insn *insn)
11339 {
11340   if (!insn)
11341     return NULL;
11342
11343   do
11344     {
11345       insn = prev_real_insn (insn);
11346     }
11347   while (insn && recog_memoized (insn) < 0);
11348
11349   return insn;
11350 }
11351
11352 static bool
11353 is_madd_op (enum attr_type t1)
11354 {
11355   unsigned int i;
11356   /* A number of these may be AArch32 only.  */
11357   enum attr_type mlatypes[] = {
11358     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11359     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11360     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11361   };
11362
11363   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11364     {
11365       if (t1 == mlatypes[i])
11366         return true;
11367     }
11368
11369   return false;
11370 }
11371
11372 /* Check if there is a register dependency between a load and the insn
11373    for which we hold recog_data.  */
11374
11375 static bool
11376 dep_between_memop_and_curr (rtx memop)
11377 {
11378   rtx load_reg;
11379   int opno;
11380
11381   gcc_assert (GET_CODE (memop) == SET);
11382
11383   if (!REG_P (SET_DEST (memop)))
11384     return false;
11385
11386   load_reg = SET_DEST (memop);
11387   for (opno = 1; opno < recog_data.n_operands; opno++)
11388     {
11389       rtx operand = recog_data.operand[opno];
11390       if (REG_P (operand)
11391           && reg_overlap_mentioned_p (load_reg, operand))
11392         return true;
11393
11394     }
11395   return false;
11396 }
11397
11398
11399 /* When working around the Cortex-A53 erratum 835769,
11400    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11401    instruction and has a preceding memory instruction such that a NOP
11402    should be inserted between them.  */
11403
11404 bool
11405 aarch64_madd_needs_nop (rtx_insn* insn)
11406 {
11407   enum attr_type attr_type;
11408   rtx_insn *prev;
11409   rtx body;
11410
11411   if (!TARGET_FIX_ERR_A53_835769)
11412     return false;
11413
11414   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11415     return false;
11416
11417   attr_type = get_attr_type (insn);
11418   if (!is_madd_op (attr_type))
11419     return false;
11420
11421   prev = aarch64_prev_real_insn (insn);
11422   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11423      Restore recog state to INSN to avoid state corruption.  */
11424   extract_constrain_insn_cached (insn);
11425
11426   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11427     return false;
11428
11429   body = single_set (prev);
11430
11431   /* If the previous insn is a memory op and there is no dependency between
11432      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11433      have a complex memory operation, probably a load/store pair.
11434      Be conservative for now and emit a NOP.  */
11435   if (GET_MODE (recog_data.operand[0]) == DImode
11436       && (!body || !dep_between_memop_and_curr (body)))
11437     return true;
11438
11439   return false;
11440
11441 }
11442
11443
11444 /* Implement FINAL_PRESCAN_INSN.  */
11445
11446 void
11447 aarch64_final_prescan_insn (rtx_insn *insn)
11448 {
11449   if (aarch64_madd_needs_nop (insn))
11450     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11451 }
11452
11453
11454 /* Return the equivalent letter for size.  */
11455 static char
11456 sizetochar (int size)
11457 {
11458   switch (size)
11459     {
11460     case 64: return 'd';
11461     case 32: return 's';
11462     case 16: return 'h';
11463     case 8 : return 'b';
11464     default: gcc_unreachable ();
11465     }
11466 }
11467
11468 /* Return true iff x is a uniform vector of floating-point
11469    constants, and the constant can be represented in
11470    quarter-precision form.  Note, as aarch64_float_const_representable
11471    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11472 static bool
11473 aarch64_vect_float_const_representable_p (rtx x)
11474 {
11475   rtx elt;
11476   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11477           && const_vec_duplicate_p (x, &elt)
11478           && aarch64_float_const_representable_p (elt));
11479 }
11480
11481 /* Return true for valid and false for invalid.  */
11482 bool
11483 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11484                               struct simd_immediate_info *info)
11485 {
11486 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11487   matches = 1;                                          \
11488   for (i = 0; i < idx; i += (STRIDE))                   \
11489     if (!(TEST))                                        \
11490       matches = 0;                                      \
11491   if (matches)                                          \
11492     {                                                   \
11493       immtype = (CLASS);                                \
11494       elsize = (ELSIZE);                                \
11495       eshift = (SHIFT);                                 \
11496       emvn = (NEG);                                     \
11497       break;                                            \
11498     }
11499
11500   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11501   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11502   unsigned char bytes[16];
11503   int immtype = -1, matches;
11504   unsigned int invmask = inverse ? 0xff : 0;
11505   int eshift, emvn;
11506
11507   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11508     {
11509       if (! (aarch64_simd_imm_zero_p (op, mode)
11510              || aarch64_vect_float_const_representable_p (op)))
11511         return false;
11512
11513       if (info)
11514         {
11515           rtx elt = CONST_VECTOR_ELT (op, 0);
11516           scalar_float_mode elt_mode
11517             = as_a <scalar_float_mode> (GET_MODE (elt));
11518
11519           info->value = elt;
11520           info->element_width = GET_MODE_BITSIZE (elt_mode);
11521           info->mvn = false;
11522           info->shift = 0;
11523         }
11524
11525       return true;
11526     }
11527
11528   /* Splat vector constant out into a byte vector.  */
11529   for (i = 0; i < n_elts; i++)
11530     {
11531       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11532          it must be laid out in the vector register in reverse order.  */
11533       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11534       unsigned HOST_WIDE_INT elpart;
11535
11536       gcc_assert (CONST_INT_P (el));
11537       elpart = INTVAL (el);
11538
11539       for (unsigned int byte = 0; byte < innersize; byte++)
11540         {
11541           bytes[idx++] = (elpart & 0xff) ^ invmask;
11542           elpart >>= BITS_PER_UNIT;
11543         }
11544
11545     }
11546
11547   /* Sanity check.  */
11548   gcc_assert (idx == GET_MODE_SIZE (mode));
11549
11550   do
11551     {
11552       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11553              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11554
11555       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11556              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11557
11558       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11559              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11560
11561       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11562              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11563
11564       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11565
11566       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11567
11568       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11569              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11570
11571       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11572              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11573
11574       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11575              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11576
11577       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11578              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11579
11580       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11581
11582       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11583
11584       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11585              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11586
11587       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11588              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11589
11590       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11591              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11592
11593       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11594              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11595
11596       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11597
11598       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11599              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11600     }
11601   while (0);
11602
11603   if (immtype == -1)
11604     return false;
11605
11606   if (info)
11607     {
11608       info->element_width = elsize;
11609       info->mvn = emvn != 0;
11610       info->shift = eshift;
11611
11612       unsigned HOST_WIDE_INT imm = 0;
11613
11614       if (immtype >= 12 && immtype <= 15)
11615         info->msl = true;
11616
11617       /* Un-invert bytes of recognized vector, if necessary.  */
11618       if (invmask != 0)
11619         for (i = 0; i < idx; i++)
11620           bytes[i] ^= invmask;
11621
11622       if (immtype == 17)
11623         {
11624           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11625           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11626
11627           for (i = 0; i < 8; i++)
11628             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11629               << (i * BITS_PER_UNIT);
11630
11631
11632           info->value = GEN_INT (imm);
11633         }
11634       else
11635         {
11636           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11637             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11638
11639           /* Construct 'abcdefgh' because the assembler cannot handle
11640              generic constants.  */
11641           if (info->mvn)
11642             imm = ~imm;
11643           imm = (imm >> info->shift) & 0xff;
11644           info->value = GEN_INT (imm);
11645         }
11646     }
11647
11648   return true;
11649 #undef CHECK
11650 }
11651
11652 /* Check of immediate shift constants are within range.  */
11653 bool
11654 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11655 {
11656   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11657   if (left)
11658     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11659   else
11660     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11661 }
11662
11663 /* Return true if X is a uniform vector where all elements
11664    are either the floating-point constant 0.0 or the
11665    integer constant 0.  */
11666 bool
11667 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11668 {
11669   return x == CONST0_RTX (mode);
11670 }
11671
11672
11673 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11674    operation of width WIDTH at bit position POS.  */
11675
11676 rtx
11677 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11678 {
11679   gcc_assert (CONST_INT_P (width));
11680   gcc_assert (CONST_INT_P (pos));
11681
11682   unsigned HOST_WIDE_INT mask
11683     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11684   return GEN_INT (mask << UINTVAL (pos));
11685 }
11686
11687 bool
11688 aarch64_mov_operand_p (rtx x, machine_mode mode)
11689 {
11690   if (GET_CODE (x) == HIGH
11691       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11692     return true;
11693
11694   if (CONST_INT_P (x))
11695     return true;
11696
11697   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11698     return true;
11699
11700   return aarch64_classify_symbolic_expression (x)
11701     == SYMBOL_TINY_ABSOLUTE;
11702 }
11703
11704 /* Return a const_int vector of VAL.  */
11705 rtx
11706 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11707 {
11708   int nunits = GET_MODE_NUNITS (mode);
11709   rtvec v = rtvec_alloc (nunits);
11710   int i;
11711
11712   rtx cache = GEN_INT (val);
11713
11714   for (i=0; i < nunits; i++)
11715     RTVEC_ELT (v, i) = cache;
11716
11717   return gen_rtx_CONST_VECTOR (mode, v);
11718 }
11719
11720 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11721
11722 bool
11723 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11724 {
11725   machine_mode vmode;
11726
11727   vmode = aarch64_preferred_simd_mode (mode);
11728   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11729   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11730 }
11731
11732 /* Construct and return a PARALLEL RTX vector with elements numbering the
11733    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11734    the vector - from the perspective of the architecture.  This does not
11735    line up with GCC's perspective on lane numbers, so we end up with
11736    different masks depending on our target endian-ness.  The diagram
11737    below may help.  We must draw the distinction when building masks
11738    which select one half of the vector.  An instruction selecting
11739    architectural low-lanes for a big-endian target, must be described using
11740    a mask selecting GCC high-lanes.
11741
11742                  Big-Endian             Little-Endian
11743
11744 GCC             0   1   2   3           3   2   1   0
11745               | x | x | x | x |       | x | x | x | x |
11746 Architecture    3   2   1   0           3   2   1   0
11747
11748 Low Mask:         { 2, 3 }                { 0, 1 }
11749 High Mask:        { 0, 1 }                { 2, 3 }
11750 */
11751
11752 rtx
11753 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11754 {
11755   int nunits = GET_MODE_NUNITS (mode);
11756   rtvec v = rtvec_alloc (nunits / 2);
11757   int high_base = nunits / 2;
11758   int low_base = 0;
11759   int base;
11760   rtx t1;
11761   int i;
11762
11763   if (BYTES_BIG_ENDIAN)
11764     base = high ? low_base : high_base;
11765   else
11766     base = high ? high_base : low_base;
11767
11768   for (i = 0; i < nunits / 2; i++)
11769     RTVEC_ELT (v, i) = GEN_INT (base + i);
11770
11771   t1 = gen_rtx_PARALLEL (mode, v);
11772   return t1;
11773 }
11774
11775 /* Check OP for validity as a PARALLEL RTX vector with elements
11776    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11777    from the perspective of the architecture.  See the diagram above
11778    aarch64_simd_vect_par_cnst_half for more details.  */
11779
11780 bool
11781 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11782                                        bool high)
11783 {
11784   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11785   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11786   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11787   int i = 0;
11788
11789   if (!VECTOR_MODE_P (mode))
11790     return false;
11791
11792   if (count_op != count_ideal)
11793     return false;
11794
11795   for (i = 0; i < count_ideal; i++)
11796     {
11797       rtx elt_op = XVECEXP (op, 0, i);
11798       rtx elt_ideal = XVECEXP (ideal, 0, i);
11799
11800       if (!CONST_INT_P (elt_op)
11801           || INTVAL (elt_ideal) != INTVAL (elt_op))
11802         return false;
11803     }
11804   return true;
11805 }
11806
11807 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11808    HIGH (exclusive).  */
11809 void
11810 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11811                           const_tree exp)
11812 {
11813   HOST_WIDE_INT lane;
11814   gcc_assert (CONST_INT_P (operand));
11815   lane = INTVAL (operand);
11816
11817   if (lane < low || lane >= high)
11818   {
11819     if (exp)
11820       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11821     else
11822       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11823   }
11824 }
11825
11826 /* Return TRUE if OP is a valid vector addressing mode.  */
11827 bool
11828 aarch64_simd_mem_operand_p (rtx op)
11829 {
11830   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11831                         || REG_P (XEXP (op, 0)));
11832 }
11833
11834 /* Emit a register copy from operand to operand, taking care not to
11835    early-clobber source registers in the process.
11836
11837    COUNT is the number of components into which the copy needs to be
11838    decomposed.  */
11839 void
11840 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11841                                 unsigned int count)
11842 {
11843   unsigned int i;
11844   int rdest = REGNO (operands[0]);
11845   int rsrc = REGNO (operands[1]);
11846
11847   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11848       || rdest < rsrc)
11849     for (i = 0; i < count; i++)
11850       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11851                       gen_rtx_REG (mode, rsrc + i));
11852   else
11853     for (i = 0; i < count; i++)
11854       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11855                       gen_rtx_REG (mode, rsrc + count - i - 1));
11856 }
11857
11858 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11859    one of VSTRUCT modes: OI, CI, or XI.  */
11860 int
11861 aarch64_simd_attr_length_rglist (machine_mode mode)
11862 {
11863   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11864 }
11865
11866 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11867    alignment of a vector to 128 bits.  */
11868 static HOST_WIDE_INT
11869 aarch64_simd_vector_alignment (const_tree type)
11870 {
11871   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11872   return MIN (align, 128);
11873 }
11874
11875 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11876 static bool
11877 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11878 {
11879   if (is_packed)
11880     return false;
11881
11882   /* We guarantee alignment for vectors up to 128-bits.  */
11883   if (tree_int_cst_compare (TYPE_SIZE (type),
11884                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11885     return false;
11886
11887   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11888   return true;
11889 }
11890
11891 /* Return true if the vector misalignment factor is supported by the
11892    target.  */
11893 static bool
11894 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11895                                              const_tree type, int misalignment,
11896                                              bool is_packed)
11897 {
11898   if (TARGET_SIMD && STRICT_ALIGNMENT)
11899     {
11900       /* Return if movmisalign pattern is not supported for this mode.  */
11901       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11902         return false;
11903
11904       if (misalignment == -1)
11905         {
11906           /* Misalignment factor is unknown at compile time but we know
11907              it's word aligned.  */
11908           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11909             {
11910               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11911
11912               if (element_size != 64)
11913                 return true;
11914             }
11915           return false;
11916         }
11917     }
11918   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11919                                                       is_packed);
11920 }
11921
11922 /* If VALS is a vector constant that can be loaded into a register
11923    using DUP, generate instructions to do so and return an RTX to
11924    assign to the register.  Otherwise return NULL_RTX.  */
11925 static rtx
11926 aarch64_simd_dup_constant (rtx vals)
11927 {
11928   machine_mode mode = GET_MODE (vals);
11929   machine_mode inner_mode = GET_MODE_INNER (mode);
11930   rtx x;
11931
11932   if (!const_vec_duplicate_p (vals, &x))
11933     return NULL_RTX;
11934
11935   /* We can load this constant by using DUP and a constant in a
11936      single ARM register.  This will be cheaper than a vector
11937      load.  */
11938   x = copy_to_mode_reg (inner_mode, x);
11939   return gen_rtx_VEC_DUPLICATE (mode, x);
11940 }
11941
11942
11943 /* Generate code to load VALS, which is a PARALLEL containing only
11944    constants (for vec_init) or CONST_VECTOR, efficiently into a
11945    register.  Returns an RTX to copy into the register, or NULL_RTX
11946    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11947 static rtx
11948 aarch64_simd_make_constant (rtx vals)
11949 {
11950   machine_mode mode = GET_MODE (vals);
11951   rtx const_dup;
11952   rtx const_vec = NULL_RTX;
11953   int n_elts = GET_MODE_NUNITS (mode);
11954   int n_const = 0;
11955   int i;
11956
11957   if (GET_CODE (vals) == CONST_VECTOR)
11958     const_vec = vals;
11959   else if (GET_CODE (vals) == PARALLEL)
11960     {
11961       /* A CONST_VECTOR must contain only CONST_INTs and
11962          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11963          Only store valid constants in a CONST_VECTOR.  */
11964       for (i = 0; i < n_elts; ++i)
11965         {
11966           rtx x = XVECEXP (vals, 0, i);
11967           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11968             n_const++;
11969         }
11970       if (n_const == n_elts)
11971         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11972     }
11973   else
11974     gcc_unreachable ();
11975
11976   if (const_vec != NULL_RTX
11977       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11978     /* Load using MOVI/MVNI.  */
11979     return const_vec;
11980   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11981     /* Loaded using DUP.  */
11982     return const_dup;
11983   else if (const_vec != NULL_RTX)
11984     /* Load from constant pool. We can not take advantage of single-cycle
11985        LD1 because we need a PC-relative addressing mode.  */
11986     return const_vec;
11987   else
11988     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11989        We can not construct an initializer.  */
11990     return NULL_RTX;
11991 }
11992
11993 /* Expand a vector initialisation sequence, such that TARGET is
11994    initialised to contain VALS.  */
11995
11996 void
11997 aarch64_expand_vector_init (rtx target, rtx vals)
11998 {
11999   machine_mode mode = GET_MODE (target);
12000   scalar_mode inner_mode = GET_MODE_INNER (mode);
12001   /* The number of vector elements.  */
12002   int n_elts = GET_MODE_NUNITS (mode);
12003   /* The number of vector elements which are not constant.  */
12004   int n_var = 0;
12005   rtx any_const = NULL_RTX;
12006   /* The first element of vals.  */
12007   rtx v0 = XVECEXP (vals, 0, 0);
12008   bool all_same = true;
12009
12010   /* Count the number of variable elements to initialise.  */
12011   for (int i = 0; i < n_elts; ++i)
12012     {
12013       rtx x = XVECEXP (vals, 0, i);
12014       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12015         ++n_var;
12016       else
12017         any_const = x;
12018
12019       all_same &= rtx_equal_p (x, v0);
12020     }
12021
12022   /* No variable elements, hand off to aarch64_simd_make_constant which knows
12023      how best to handle this.  */
12024   if (n_var == 0)
12025     {
12026       rtx constant = aarch64_simd_make_constant (vals);
12027       if (constant != NULL_RTX)
12028         {
12029           emit_move_insn (target, constant);
12030           return;
12031         }
12032     }
12033
12034   /* Splat a single non-constant element if we can.  */
12035   if (all_same)
12036     {
12037       rtx x = copy_to_mode_reg (inner_mode, v0);
12038       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12039       return;
12040     }
12041
12042   enum insn_code icode = optab_handler (vec_set_optab, mode);
12043   gcc_assert (icode != CODE_FOR_nothing);
12044
12045   /* If there are only variable elements, try to optimize
12046      the insertion using dup for the most common element
12047      followed by insertions.  */
12048
12049   /* The algorithm will fill matches[*][0] with the earliest matching element,
12050      and matches[X][1] with the count of duplicate elements (if X is the
12051      earliest element which has duplicates).  */
12052
12053   if (n_var == n_elts && n_elts <= 16)
12054     {
12055       int matches[16][2] = {0};
12056       for (int i = 0; i < n_elts; i++)
12057         {
12058           for (int j = 0; j <= i; j++)
12059             {
12060               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12061                 {
12062                   matches[i][0] = j;
12063                   matches[j][1]++;
12064                   break;
12065                 }
12066             }
12067         }
12068       int maxelement = 0;
12069       int maxv = 0;
12070       for (int i = 0; i < n_elts; i++)
12071         if (matches[i][1] > maxv)
12072           {
12073             maxelement = i;
12074             maxv = matches[i][1];
12075           }
12076
12077       /* Create a duplicate of the most common element.  */
12078       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12079       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12080
12081       /* Insert the rest.  */
12082       for (int i = 0; i < n_elts; i++)
12083         {
12084           rtx x = XVECEXP (vals, 0, i);
12085           if (matches[i][0] == maxelement)
12086             continue;
12087           x = copy_to_mode_reg (inner_mode, x);
12088           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12089         }
12090       return;
12091     }
12092
12093   /* Initialise a vector which is part-variable.  We want to first try
12094      to build those lanes which are constant in the most efficient way we
12095      can.  */
12096   if (n_var != n_elts)
12097     {
12098       rtx copy = copy_rtx (vals);
12099
12100       /* Load constant part of vector.  We really don't care what goes into the
12101          parts we will overwrite, but we're more likely to be able to load the
12102          constant efficiently if it has fewer, larger, repeating parts
12103          (see aarch64_simd_valid_immediate).  */
12104       for (int i = 0; i < n_elts; i++)
12105         {
12106           rtx x = XVECEXP (vals, 0, i);
12107           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12108             continue;
12109           rtx subst = any_const;
12110           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12111             {
12112               /* Look in the copied vector, as more elements are const.  */
12113               rtx test = XVECEXP (copy, 0, i ^ bit);
12114               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12115                 {
12116                   subst = test;
12117                   break;
12118                 }
12119             }
12120           XVECEXP (copy, 0, i) = subst;
12121         }
12122       aarch64_expand_vector_init (target, copy);
12123     }
12124
12125   /* Insert the variable lanes directly.  */
12126   for (int i = 0; i < n_elts; i++)
12127     {
12128       rtx x = XVECEXP (vals, 0, i);
12129       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12130         continue;
12131       x = copy_to_mode_reg (inner_mode, x);
12132       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12133     }
12134 }
12135
12136 static unsigned HOST_WIDE_INT
12137 aarch64_shift_truncation_mask (machine_mode mode)
12138 {
12139   return
12140     (!SHIFT_COUNT_TRUNCATED
12141      || aarch64_vector_mode_supported_p (mode)
12142      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12143 }
12144
12145 /* Select a format to encode pointers in exception handling data.  */
12146 int
12147 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12148 {
12149    int type;
12150    switch (aarch64_cmodel)
12151      {
12152      case AARCH64_CMODEL_TINY:
12153      case AARCH64_CMODEL_TINY_PIC:
12154      case AARCH64_CMODEL_SMALL:
12155      case AARCH64_CMODEL_SMALL_PIC:
12156      case AARCH64_CMODEL_SMALL_SPIC:
12157        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12158           for everything.  */
12159        type = DW_EH_PE_sdata4;
12160        break;
12161      default:
12162        /* No assumptions here.  8-byte relocs required.  */
12163        type = DW_EH_PE_sdata8;
12164        break;
12165      }
12166    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12167 }
12168
12169 /* The last .arch and .tune assembly strings that we printed.  */
12170 static std::string aarch64_last_printed_arch_string;
12171 static std::string aarch64_last_printed_tune_string;
12172
12173 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12174    by the function fndecl.  */
12175
12176 void
12177 aarch64_declare_function_name (FILE *stream, const char* name,
12178                                 tree fndecl)
12179 {
12180   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12181
12182   struct cl_target_option *targ_options;
12183   if (target_parts)
12184     targ_options = TREE_TARGET_OPTION (target_parts);
12185   else
12186     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12187   gcc_assert (targ_options);
12188
12189   const struct processor *this_arch
12190     = aarch64_get_arch (targ_options->x_explicit_arch);
12191
12192   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12193   std::string extension
12194     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12195                                                   this_arch->flags);
12196   /* Only update the assembler .arch string if it is distinct from the last
12197      such string we printed.  */
12198   std::string to_print = this_arch->name + extension;
12199   if (to_print != aarch64_last_printed_arch_string)
12200     {
12201       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12202       aarch64_last_printed_arch_string = to_print;
12203     }
12204
12205   /* Print the cpu name we're tuning for in the comments, might be
12206      useful to readers of the generated asm.  Do it only when it changes
12207      from function to function and verbose assembly is requested.  */
12208   const struct processor *this_tune
12209     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12210
12211   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12212     {
12213       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12214                    this_tune->name);
12215       aarch64_last_printed_tune_string = this_tune->name;
12216     }
12217
12218   /* Don't forget the type directive for ELF.  */
12219   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12220   ASM_OUTPUT_LABEL (stream, name);
12221 }
12222
12223 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12224
12225 static void
12226 aarch64_start_file (void)
12227 {
12228   struct cl_target_option *default_options
12229     = TREE_TARGET_OPTION (target_option_default_node);
12230
12231   const struct processor *default_arch
12232     = aarch64_get_arch (default_options->x_explicit_arch);
12233   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12234   std::string extension
12235     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12236                                                   default_arch->flags);
12237
12238    aarch64_last_printed_arch_string = default_arch->name + extension;
12239    aarch64_last_printed_tune_string = "";
12240    asm_fprintf (asm_out_file, "\t.arch %s\n",
12241                 aarch64_last_printed_arch_string.c_str ());
12242
12243    default_file_start ();
12244 }
12245
12246 /* Emit load exclusive.  */
12247
12248 static void
12249 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12250                              rtx mem, rtx model_rtx)
12251 {
12252   rtx (*gen) (rtx, rtx, rtx);
12253
12254   switch (mode)
12255     {
12256     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12257     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12258     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12259     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12260     default:
12261       gcc_unreachable ();
12262     }
12263
12264   emit_insn (gen (rval, mem, model_rtx));
12265 }
12266
12267 /* Emit store exclusive.  */
12268
12269 static void
12270 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12271                               rtx rval, rtx mem, rtx model_rtx)
12272 {
12273   rtx (*gen) (rtx, rtx, rtx, rtx);
12274
12275   switch (mode)
12276     {
12277     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12278     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12279     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12280     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12281     default:
12282       gcc_unreachable ();
12283     }
12284
12285   emit_insn (gen (bval, rval, mem, model_rtx));
12286 }
12287
12288 /* Mark the previous jump instruction as unlikely.  */
12289
12290 static void
12291 aarch64_emit_unlikely_jump (rtx insn)
12292 {
12293   rtx_insn *jump = emit_jump_insn (insn);
12294   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12295 }
12296
12297 /* Expand a compare and swap pattern.  */
12298
12299 void
12300 aarch64_expand_compare_and_swap (rtx operands[])
12301 {
12302   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12303   machine_mode mode, cmp_mode;
12304   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12305   int idx;
12306   gen_cas_fn gen;
12307   const gen_cas_fn split_cas[] =
12308   {
12309     gen_aarch64_compare_and_swapqi,
12310     gen_aarch64_compare_and_swaphi,
12311     gen_aarch64_compare_and_swapsi,
12312     gen_aarch64_compare_and_swapdi
12313   };
12314   const gen_cas_fn atomic_cas[] =
12315   {
12316     gen_aarch64_compare_and_swapqi_lse,
12317     gen_aarch64_compare_and_swaphi_lse,
12318     gen_aarch64_compare_and_swapsi_lse,
12319     gen_aarch64_compare_and_swapdi_lse
12320   };
12321
12322   bval = operands[0];
12323   rval = operands[1];
12324   mem = operands[2];
12325   oldval = operands[3];
12326   newval = operands[4];
12327   is_weak = operands[5];
12328   mod_s = operands[6];
12329   mod_f = operands[7];
12330   mode = GET_MODE (mem);
12331   cmp_mode = mode;
12332
12333   /* Normally the succ memory model must be stronger than fail, but in the
12334      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12335      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12336
12337   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12338       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12339     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12340
12341   switch (mode)
12342     {
12343     case E_QImode:
12344     case E_HImode:
12345       /* For short modes, we're going to perform the comparison in SImode,
12346          so do the zero-extension now.  */
12347       cmp_mode = SImode;
12348       rval = gen_reg_rtx (SImode);
12349       oldval = convert_modes (SImode, mode, oldval, true);
12350       /* Fall through.  */
12351
12352     case E_SImode:
12353     case E_DImode:
12354       /* Force the value into a register if needed.  */
12355       if (!aarch64_plus_operand (oldval, mode))
12356         oldval = force_reg (cmp_mode, oldval);
12357       break;
12358
12359     default:
12360       gcc_unreachable ();
12361     }
12362
12363   switch (mode)
12364     {
12365     case E_QImode: idx = 0; break;
12366     case E_HImode: idx = 1; break;
12367     case E_SImode: idx = 2; break;
12368     case E_DImode: idx = 3; break;
12369     default:
12370       gcc_unreachable ();
12371     }
12372   if (TARGET_LSE)
12373     gen = atomic_cas[idx];
12374   else
12375     gen = split_cas[idx];
12376
12377   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12378
12379   if (mode == QImode || mode == HImode)
12380     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12381
12382   x = gen_rtx_REG (CCmode, CC_REGNUM);
12383   x = gen_rtx_EQ (SImode, x, const0_rtx);
12384   emit_insn (gen_rtx_SET (bval, x));
12385 }
12386
12387 /* Test whether the target supports using a atomic load-operate instruction.
12388    CODE is the operation and AFTER is TRUE if the data in memory after the
12389    operation should be returned and FALSE if the data before the operation
12390    should be returned.  Returns FALSE if the operation isn't supported by the
12391    architecture.  */
12392
12393 bool
12394 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12395 {
12396   if (!TARGET_LSE)
12397     return false;
12398
12399   switch (code)
12400     {
12401     case SET:
12402     case AND:
12403     case IOR:
12404     case XOR:
12405     case MINUS:
12406     case PLUS:
12407       return true;
12408     default:
12409       return false;
12410     }
12411 }
12412
12413 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12414    sequence implementing an atomic operation.  */
12415
12416 static void
12417 aarch64_emit_post_barrier (enum memmodel model)
12418 {
12419   const enum memmodel base_model = memmodel_base (model);
12420
12421   if (is_mm_sync (model)
12422       && (base_model == MEMMODEL_ACQUIRE
12423           || base_model == MEMMODEL_ACQ_REL
12424           || base_model == MEMMODEL_SEQ_CST))
12425     {
12426       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12427     }
12428 }
12429
12430 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12431    for the data in memory.  EXPECTED is the value expected to be in memory.
12432    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12433    is the memory ordering to use.  */
12434
12435 void
12436 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12437                         rtx expected, rtx desired,
12438                         rtx model)
12439 {
12440   rtx (*gen) (rtx, rtx, rtx, rtx);
12441   machine_mode mode;
12442
12443   mode = GET_MODE (mem);
12444
12445   switch (mode)
12446     {
12447     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12448     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12449     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12450     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12451     default:
12452       gcc_unreachable ();
12453     }
12454
12455   /* Move the expected value into the CAS destination register.  */
12456   emit_insn (gen_rtx_SET (rval, expected));
12457
12458   /* Emit the CAS.  */
12459   emit_insn (gen (rval, mem, desired, model));
12460
12461   /* Compare the expected value with the value loaded by the CAS, to establish
12462      whether the swap was made.  */
12463   aarch64_gen_compare_reg (EQ, rval, expected);
12464 }
12465
12466 /* Split a compare and swap pattern.  */
12467
12468 void
12469 aarch64_split_compare_and_swap (rtx operands[])
12470 {
12471   rtx rval, mem, oldval, newval, scratch;
12472   machine_mode mode;
12473   bool is_weak;
12474   rtx_code_label *label1, *label2;
12475   rtx x, cond;
12476   enum memmodel model;
12477   rtx model_rtx;
12478
12479   rval = operands[0];
12480   mem = operands[1];
12481   oldval = operands[2];
12482   newval = operands[3];
12483   is_weak = (operands[4] != const0_rtx);
12484   model_rtx = operands[5];
12485   scratch = operands[7];
12486   mode = GET_MODE (mem);
12487   model = memmodel_from_int (INTVAL (model_rtx));
12488
12489   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12490     loop:
12491     .label1:
12492         LD[A]XR rval, [mem]
12493         CBNZ    rval, .label2
12494         ST[L]XR scratch, newval, [mem]
12495         CBNZ    scratch, .label1
12496     .label2:
12497         CMP     rval, 0.  */
12498   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12499
12500   label1 = NULL;
12501   if (!is_weak)
12502     {
12503       label1 = gen_label_rtx ();
12504       emit_label (label1);
12505     }
12506   label2 = gen_label_rtx ();
12507
12508   /* The initial load can be relaxed for a __sync operation since a final
12509      barrier will be emitted to stop code hoisting.  */
12510   if (is_mm_sync (model))
12511     aarch64_emit_load_exclusive (mode, rval, mem,
12512                                  GEN_INT (MEMMODEL_RELAXED));
12513   else
12514     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12515
12516   if (strong_zero_p)
12517     {
12518       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12519       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12520                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12521       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12522     }
12523   else
12524     {
12525       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12526       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12527       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12528                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12529       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12530     }
12531
12532   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12533
12534   if (!is_weak)
12535     {
12536       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12537       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12538                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12539       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12540     }
12541   else
12542     {
12543       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12544       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12545       emit_insn (gen_rtx_SET (cond, x));
12546     }
12547
12548   emit_label (label2);
12549   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12550      to set the condition flags.  If this is not used it will be removed by
12551      later passes.  */
12552   if (strong_zero_p)
12553     {
12554       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12555       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12556       emit_insn (gen_rtx_SET (cond, x));
12557     }
12558   /* Emit any final barrier needed for a __sync operation.  */
12559   if (is_mm_sync (model))
12560     aarch64_emit_post_barrier (model);
12561 }
12562
12563 /* Emit a BIC instruction.  */
12564
12565 static void
12566 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12567 {
12568   rtx shift_rtx = GEN_INT (shift);
12569   rtx (*gen) (rtx, rtx, rtx, rtx);
12570
12571   switch (mode)
12572     {
12573     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12574     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12575     default:
12576       gcc_unreachable ();
12577     }
12578
12579   emit_insn (gen (dst, s2, shift_rtx, s1));
12580 }
12581
12582 /* Emit an atomic swap.  */
12583
12584 static void
12585 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12586                           rtx mem, rtx model)
12587 {
12588   rtx (*gen) (rtx, rtx, rtx, rtx);
12589
12590   switch (mode)
12591     {
12592     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12593     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12594     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12595     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12596     default:
12597       gcc_unreachable ();
12598     }
12599
12600   emit_insn (gen (dst, mem, value, model));
12601 }
12602
12603 /* Operations supported by aarch64_emit_atomic_load_op.  */
12604
12605 enum aarch64_atomic_load_op_code
12606 {
12607   AARCH64_LDOP_PLUS,    /* A + B  */
12608   AARCH64_LDOP_XOR,     /* A ^ B  */
12609   AARCH64_LDOP_OR,      /* A | B  */
12610   AARCH64_LDOP_BIC      /* A & ~B  */
12611 };
12612
12613 /* Emit an atomic load-operate.  */
12614
12615 static void
12616 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12617                              machine_mode mode, rtx dst, rtx src,
12618                              rtx mem, rtx model)
12619 {
12620   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12621   const aarch64_atomic_load_op_fn plus[] =
12622   {
12623     gen_aarch64_atomic_loadaddqi,
12624     gen_aarch64_atomic_loadaddhi,
12625     gen_aarch64_atomic_loadaddsi,
12626     gen_aarch64_atomic_loadadddi
12627   };
12628   const aarch64_atomic_load_op_fn eor[] =
12629   {
12630     gen_aarch64_atomic_loadeorqi,
12631     gen_aarch64_atomic_loadeorhi,
12632     gen_aarch64_atomic_loadeorsi,
12633     gen_aarch64_atomic_loadeordi
12634   };
12635   const aarch64_atomic_load_op_fn ior[] =
12636   {
12637     gen_aarch64_atomic_loadsetqi,
12638     gen_aarch64_atomic_loadsethi,
12639     gen_aarch64_atomic_loadsetsi,
12640     gen_aarch64_atomic_loadsetdi
12641   };
12642   const aarch64_atomic_load_op_fn bic[] =
12643   {
12644     gen_aarch64_atomic_loadclrqi,
12645     gen_aarch64_atomic_loadclrhi,
12646     gen_aarch64_atomic_loadclrsi,
12647     gen_aarch64_atomic_loadclrdi
12648   };
12649   aarch64_atomic_load_op_fn gen;
12650   int idx = 0;
12651
12652   switch (mode)
12653     {
12654     case E_QImode: idx = 0; break;
12655     case E_HImode: idx = 1; break;
12656     case E_SImode: idx = 2; break;
12657     case E_DImode: idx = 3; break;
12658     default:
12659       gcc_unreachable ();
12660     }
12661
12662   switch (code)
12663     {
12664     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12665     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12666     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12667     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12668     default:
12669       gcc_unreachable ();
12670     }
12671
12672   emit_insn (gen (dst, mem, src, model));
12673 }
12674
12675 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12676    location to store the data read from memory.  OUT_RESULT is the location to
12677    store the result of the operation.  MEM is the memory location to read and
12678    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12679    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12680    be NULL.  */
12681
12682 void
12683 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12684                          rtx mem, rtx value, rtx model_rtx)
12685 {
12686   machine_mode mode = GET_MODE (mem);
12687   machine_mode wmode = (mode == DImode ? DImode : SImode);
12688   const bool short_mode = (mode < SImode);
12689   aarch64_atomic_load_op_code ldop_code;
12690   rtx src;
12691   rtx x;
12692
12693   if (out_data)
12694     out_data = gen_lowpart (mode, out_data);
12695
12696   if (out_result)
12697     out_result = gen_lowpart (mode, out_result);
12698
12699   /* Make sure the value is in a register, putting it into a destination
12700      register if it needs to be manipulated.  */
12701   if (!register_operand (value, mode)
12702       || code == AND || code == MINUS)
12703     {
12704       src = out_result ? out_result : out_data;
12705       emit_move_insn (src, gen_lowpart (mode, value));
12706     }
12707   else
12708     src = value;
12709   gcc_assert (register_operand (src, mode));
12710
12711   /* Preprocess the data for the operation as necessary.  If the operation is
12712      a SET then emit a swap instruction and finish.  */
12713   switch (code)
12714     {
12715     case SET:
12716       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12717       return;
12718
12719     case MINUS:
12720       /* Negate the value and treat it as a PLUS.  */
12721       {
12722         rtx neg_src;
12723
12724         /* Resize the value if necessary.  */
12725         if (short_mode)
12726           src = gen_lowpart (wmode, src);
12727
12728         neg_src = gen_rtx_NEG (wmode, src);
12729         emit_insn (gen_rtx_SET (src, neg_src));
12730
12731         if (short_mode)
12732           src = gen_lowpart (mode, src);
12733       }
12734       /* Fall-through.  */
12735     case PLUS:
12736       ldop_code = AARCH64_LDOP_PLUS;
12737       break;
12738
12739     case IOR:
12740       ldop_code = AARCH64_LDOP_OR;
12741       break;
12742
12743     case XOR:
12744       ldop_code = AARCH64_LDOP_XOR;
12745       break;
12746
12747     case AND:
12748       {
12749         rtx not_src;
12750
12751         /* Resize the value if necessary.  */
12752         if (short_mode)
12753           src = gen_lowpart (wmode, src);
12754
12755         not_src = gen_rtx_NOT (wmode, src);
12756         emit_insn (gen_rtx_SET (src, not_src));
12757
12758         if (short_mode)
12759           src = gen_lowpart (mode, src);
12760       }
12761       ldop_code = AARCH64_LDOP_BIC;
12762       break;
12763
12764     default:
12765       /* The operation can't be done with atomic instructions.  */
12766       gcc_unreachable ();
12767     }
12768
12769   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12770
12771   /* If necessary, calculate the data in memory after the update by redoing the
12772      operation from values in registers.  */
12773   if (!out_result)
12774     return;
12775
12776   if (short_mode)
12777     {
12778       src = gen_lowpart (wmode, src);
12779       out_data = gen_lowpart (wmode, out_data);
12780       out_result = gen_lowpart (wmode, out_result);
12781     }
12782
12783   x = NULL_RTX;
12784
12785   switch (code)
12786     {
12787     case MINUS:
12788     case PLUS:
12789       x = gen_rtx_PLUS (wmode, out_data, src);
12790       break;
12791     case IOR:
12792       x = gen_rtx_IOR (wmode, out_data, src);
12793       break;
12794     case XOR:
12795       x = gen_rtx_XOR (wmode, out_data, src);
12796       break;
12797     case AND:
12798       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12799       return;
12800     default:
12801       gcc_unreachable ();
12802     }
12803
12804   emit_set_insn (out_result, x);
12805
12806   return;
12807 }
12808
12809 /* Split an atomic operation.  */
12810
12811 void
12812 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12813                          rtx value, rtx model_rtx, rtx cond)
12814 {
12815   machine_mode mode = GET_MODE (mem);
12816   machine_mode wmode = (mode == DImode ? DImode : SImode);
12817   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12818   const bool is_sync = is_mm_sync (model);
12819   rtx_code_label *label;
12820   rtx x;
12821
12822   /* Split the atomic operation into a sequence.  */
12823   label = gen_label_rtx ();
12824   emit_label (label);
12825
12826   if (new_out)
12827     new_out = gen_lowpart (wmode, new_out);
12828   if (old_out)
12829     old_out = gen_lowpart (wmode, old_out);
12830   else
12831     old_out = new_out;
12832   value = simplify_gen_subreg (wmode, value, mode, 0);
12833
12834   /* The initial load can be relaxed for a __sync operation since a final
12835      barrier will be emitted to stop code hoisting.  */
12836  if (is_sync)
12837     aarch64_emit_load_exclusive (mode, old_out, mem,
12838                                  GEN_INT (MEMMODEL_RELAXED));
12839   else
12840     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12841
12842   switch (code)
12843     {
12844     case SET:
12845       new_out = value;
12846       break;
12847
12848     case NOT:
12849       x = gen_rtx_AND (wmode, old_out, value);
12850       emit_insn (gen_rtx_SET (new_out, x));
12851       x = gen_rtx_NOT (wmode, new_out);
12852       emit_insn (gen_rtx_SET (new_out, x));
12853       break;
12854
12855     case MINUS:
12856       if (CONST_INT_P (value))
12857         {
12858           value = GEN_INT (-INTVAL (value));
12859           code = PLUS;
12860         }
12861       /* Fall through.  */
12862
12863     default:
12864       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12865       emit_insn (gen_rtx_SET (new_out, x));
12866       break;
12867     }
12868
12869   aarch64_emit_store_exclusive (mode, cond, mem,
12870                                 gen_lowpart (mode, new_out), model_rtx);
12871
12872   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12873   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12874                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12875   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12876
12877   /* Emit any final barrier needed for a __sync operation.  */
12878   if (is_sync)
12879     aarch64_emit_post_barrier (model);
12880 }
12881
12882 static void
12883 aarch64_init_libfuncs (void)
12884 {
12885    /* Half-precision float operations.  The compiler handles all operations
12886      with NULL libfuncs by converting to SFmode.  */
12887
12888   /* Conversions.  */
12889   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12890   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12891
12892   /* Arithmetic.  */
12893   set_optab_libfunc (add_optab, HFmode, NULL);
12894   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12895   set_optab_libfunc (smul_optab, HFmode, NULL);
12896   set_optab_libfunc (neg_optab, HFmode, NULL);
12897   set_optab_libfunc (sub_optab, HFmode, NULL);
12898
12899   /* Comparisons.  */
12900   set_optab_libfunc (eq_optab, HFmode, NULL);
12901   set_optab_libfunc (ne_optab, HFmode, NULL);
12902   set_optab_libfunc (lt_optab, HFmode, NULL);
12903   set_optab_libfunc (le_optab, HFmode, NULL);
12904   set_optab_libfunc (ge_optab, HFmode, NULL);
12905   set_optab_libfunc (gt_optab, HFmode, NULL);
12906   set_optab_libfunc (unord_optab, HFmode, NULL);
12907 }
12908
12909 /* Target hook for c_mode_for_suffix.  */
12910 static machine_mode
12911 aarch64_c_mode_for_suffix (char suffix)
12912 {
12913   if (suffix == 'q')
12914     return TFmode;
12915
12916   return VOIDmode;
12917 }
12918
12919 /* We can only represent floating point constants which will fit in
12920    "quarter-precision" values.  These values are characterised by
12921    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12922    by:
12923
12924    (-1)^s * (n/16) * 2^r
12925
12926    Where:
12927      's' is the sign bit.
12928      'n' is an integer in the range 16 <= n <= 31.
12929      'r' is an integer in the range -3 <= r <= 4.  */
12930
12931 /* Return true iff X can be represented by a quarter-precision
12932    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12933 bool
12934 aarch64_float_const_representable_p (rtx x)
12935 {
12936   /* This represents our current view of how many bits
12937      make up the mantissa.  */
12938   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12939   int exponent;
12940   unsigned HOST_WIDE_INT mantissa, mask;
12941   REAL_VALUE_TYPE r, m;
12942   bool fail;
12943
12944   if (!CONST_DOUBLE_P (x))
12945     return false;
12946
12947   /* We don't support HFmode constants yet.  */
12948   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12949     return false;
12950
12951   r = *CONST_DOUBLE_REAL_VALUE (x);
12952
12953   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12954      know if we have +zero until we analyse the mantissa, but we
12955      can reject the other invalid values.  */
12956   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12957       || REAL_VALUE_MINUS_ZERO (r))
12958     return false;
12959
12960   /* Extract exponent.  */
12961   r = real_value_abs (&r);
12962   exponent = REAL_EXP (&r);
12963
12964   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12965      highest (sign) bit, with a fixed binary point at bit point_pos.
12966      m1 holds the low part of the mantissa, m2 the high part.
12967      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12968      bits for the mantissa, this can fail (low bits will be lost).  */
12969   real_ldexp (&m, &r, point_pos - exponent);
12970   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12971
12972   /* If the low part of the mantissa has bits set we cannot represent
12973      the value.  */
12974   if (w.ulow () != 0)
12975     return false;
12976   /* We have rejected the lower HOST_WIDE_INT, so update our
12977      understanding of how many bits lie in the mantissa and
12978      look only at the high HOST_WIDE_INT.  */
12979   mantissa = w.elt (1);
12980   point_pos -= HOST_BITS_PER_WIDE_INT;
12981
12982   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12983   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12984   if ((mantissa & mask) != 0)
12985     return false;
12986
12987   /* Having filtered unrepresentable values, we may now remove all
12988      but the highest 5 bits.  */
12989   mantissa >>= point_pos - 5;
12990
12991   /* We cannot represent the value 0.0, so reject it.  This is handled
12992      elsewhere.  */
12993   if (mantissa == 0)
12994     return false;
12995
12996   /* Then, as bit 4 is always set, we can mask it off, leaving
12997      the mantissa in the range [0, 15].  */
12998   mantissa &= ~(1 << 4);
12999   gcc_assert (mantissa <= 15);
13000
13001   /* GCC internally does not use IEEE754-like encoding (where normalized
13002      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
13003      Our mantissa values are shifted 4 places to the left relative to
13004      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13005      by 5 places to correct for GCC's representation.  */
13006   exponent = 5 - exponent;
13007
13008   return (exponent >= 0 && exponent <= 7);
13009 }
13010
13011 char*
13012 aarch64_output_simd_mov_immediate (rtx const_vector,
13013                                    machine_mode mode,
13014                                    unsigned width)
13015 {
13016   bool is_valid;
13017   static char templ[40];
13018   const char *mnemonic;
13019   const char *shift_op;
13020   unsigned int lane_count = 0;
13021   char element_char;
13022
13023   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13024
13025   /* This will return true to show const_vector is legal for use as either
13026      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
13027      also update INFO to show how the immediate should be generated.  */
13028   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13029   gcc_assert (is_valid);
13030
13031   element_char = sizetochar (info.element_width);
13032   lane_count = width / info.element_width;
13033
13034   mode = GET_MODE_INNER (mode);
13035   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13036     {
13037       gcc_assert (info.shift == 0 && ! info.mvn);
13038       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13039          move immediate path.  */
13040       if (aarch64_float_const_zero_rtx_p (info.value))
13041         info.value = GEN_INT (0);
13042       else
13043         {
13044           const unsigned int buf_size = 20;
13045           char float_buf[buf_size] = {'\0'};
13046           real_to_decimal_for_mode (float_buf,
13047                                     CONST_DOUBLE_REAL_VALUE (info.value),
13048                                     buf_size, buf_size, 1, mode);
13049
13050           if (lane_count == 1)
13051             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13052           else
13053             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13054                       lane_count, element_char, float_buf);
13055           return templ;
13056         }
13057     }
13058
13059   mnemonic = info.mvn ? "mvni" : "movi";
13060   shift_op = info.msl ? "msl" : "lsl";
13061
13062   gcc_assert (CONST_INT_P (info.value));
13063   if (lane_count == 1)
13064     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13065               mnemonic, UINTVAL (info.value));
13066   else if (info.shift)
13067     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13068               ", %s %d", mnemonic, lane_count, element_char,
13069               UINTVAL (info.value), shift_op, info.shift);
13070   else
13071     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13072               mnemonic, lane_count, element_char, UINTVAL (info.value));
13073   return templ;
13074 }
13075
13076 char*
13077 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13078 {
13079
13080   /* If a floating point number was passed and we desire to use it in an
13081      integer mode do the conversion to integer.  */
13082   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13083     {
13084       unsigned HOST_WIDE_INT ival;
13085       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13086           gcc_unreachable ();
13087       immediate = gen_int_mode (ival, mode);
13088     }
13089
13090   machine_mode vmode;
13091   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13092      a 128 bit vector mode.  */
13093   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13094
13095   vmode = aarch64_simd_container_mode (mode, width);
13096   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13097   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13098 }
13099
13100 /* Split operands into moves from op[1] + op[2] into op[0].  */
13101
13102 void
13103 aarch64_split_combinev16qi (rtx operands[3])
13104 {
13105   unsigned int dest = REGNO (operands[0]);
13106   unsigned int src1 = REGNO (operands[1]);
13107   unsigned int src2 = REGNO (operands[2]);
13108   machine_mode halfmode = GET_MODE (operands[1]);
13109   unsigned int halfregs = REG_NREGS (operands[1]);
13110   rtx destlo, desthi;
13111
13112   gcc_assert (halfmode == V16QImode);
13113
13114   if (src1 == dest && src2 == dest + halfregs)
13115     {
13116       /* No-op move.  Can't split to nothing; emit something.  */
13117       emit_note (NOTE_INSN_DELETED);
13118       return;
13119     }
13120
13121   /* Preserve register attributes for variable tracking.  */
13122   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13123   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13124                                GET_MODE_SIZE (halfmode));
13125
13126   /* Special case of reversed high/low parts.  */
13127   if (reg_overlap_mentioned_p (operands[2], destlo)
13128       && reg_overlap_mentioned_p (operands[1], desthi))
13129     {
13130       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13131       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13132       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13133     }
13134   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13135     {
13136       /* Try to avoid unnecessary moves if part of the result
13137          is in the right place already.  */
13138       if (src1 != dest)
13139         emit_move_insn (destlo, operands[1]);
13140       if (src2 != dest + halfregs)
13141         emit_move_insn (desthi, operands[2]);
13142     }
13143   else
13144     {
13145       if (src2 != dest + halfregs)
13146         emit_move_insn (desthi, operands[2]);
13147       if (src1 != dest)
13148         emit_move_insn (destlo, operands[1]);
13149     }
13150 }
13151
13152 /* vec_perm support.  */
13153
13154 #define MAX_VECT_LEN 16
13155
13156 struct expand_vec_perm_d
13157 {
13158   rtx target, op0, op1;
13159   unsigned char perm[MAX_VECT_LEN];
13160   machine_mode vmode;
13161   unsigned char nelt;
13162   bool one_vector_p;
13163   bool testing_p;
13164 };
13165
13166 /* Generate a variable permutation.  */
13167
13168 static void
13169 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13170 {
13171   machine_mode vmode = GET_MODE (target);
13172   bool one_vector_p = rtx_equal_p (op0, op1);
13173
13174   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13175   gcc_checking_assert (GET_MODE (op0) == vmode);
13176   gcc_checking_assert (GET_MODE (op1) == vmode);
13177   gcc_checking_assert (GET_MODE (sel) == vmode);
13178   gcc_checking_assert (TARGET_SIMD);
13179
13180   if (one_vector_p)
13181     {
13182       if (vmode == V8QImode)
13183         {
13184           /* Expand the argument to a V16QI mode by duplicating it.  */
13185           rtx pair = gen_reg_rtx (V16QImode);
13186           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13187           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13188         }
13189       else
13190         {
13191           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13192         }
13193     }
13194   else
13195     {
13196       rtx pair;
13197
13198       if (vmode == V8QImode)
13199         {
13200           pair = gen_reg_rtx (V16QImode);
13201           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13202           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13203         }
13204       else
13205         {
13206           pair = gen_reg_rtx (OImode);
13207           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13208           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13209         }
13210     }
13211 }
13212
13213 void
13214 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13215 {
13216   machine_mode vmode = GET_MODE (target);
13217   unsigned int nelt = GET_MODE_NUNITS (vmode);
13218   bool one_vector_p = rtx_equal_p (op0, op1);
13219   rtx mask;
13220
13221   /* The TBL instruction does not use a modulo index, so we must take care
13222      of that ourselves.  */
13223   mask = aarch64_simd_gen_const_vector_dup (vmode,
13224       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13225   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13226
13227   /* For big-endian, we also need to reverse the index within the vector
13228      (but not which vector).  */
13229   if (BYTES_BIG_ENDIAN)
13230     {
13231       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13232       if (!one_vector_p)
13233         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13234       sel = expand_simple_binop (vmode, XOR, sel, mask,
13235                                  NULL, 0, OPTAB_LIB_WIDEN);
13236     }
13237   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13238 }
13239
13240 /* Recognize patterns suitable for the TRN instructions.  */
13241 static bool
13242 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13243 {
13244   unsigned int i, odd, mask, nelt = d->nelt;
13245   rtx out, in0, in1, x;
13246   rtx (*gen) (rtx, rtx, rtx);
13247   machine_mode vmode = d->vmode;
13248
13249   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13250     return false;
13251
13252   /* Note that these are little-endian tests.
13253      We correct for big-endian later.  */
13254   if (d->perm[0] == 0)
13255     odd = 0;
13256   else if (d->perm[0] == 1)
13257     odd = 1;
13258   else
13259     return false;
13260   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13261
13262   for (i = 0; i < nelt; i += 2)
13263     {
13264       if (d->perm[i] != i + odd)
13265         return false;
13266       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13267         return false;
13268     }
13269
13270   /* Success!  */
13271   if (d->testing_p)
13272     return true;
13273
13274   in0 = d->op0;
13275   in1 = d->op1;
13276   if (BYTES_BIG_ENDIAN)
13277     {
13278       x = in0, in0 = in1, in1 = x;
13279       odd = !odd;
13280     }
13281   out = d->target;
13282
13283   if (odd)
13284     {
13285       switch (vmode)
13286         {
13287         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13288         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13289         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13290         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13291         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13292         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13293         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13294         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13295         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13296         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13297         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13298         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13299         default:
13300           return false;
13301         }
13302     }
13303   else
13304     {
13305       switch (vmode)
13306         {
13307         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13308         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13309         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13310         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13311         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13312         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13313         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13314         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13315         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13316         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13317         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13318         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13319         default:
13320           return false;
13321         }
13322     }
13323
13324   emit_insn (gen (out, in0, in1));
13325   return true;
13326 }
13327
13328 /* Recognize patterns suitable for the UZP instructions.  */
13329 static bool
13330 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13331 {
13332   unsigned int i, odd, mask, nelt = d->nelt;
13333   rtx out, in0, in1, x;
13334   rtx (*gen) (rtx, rtx, rtx);
13335   machine_mode vmode = d->vmode;
13336
13337   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13338     return false;
13339
13340   /* Note that these are little-endian tests.
13341      We correct for big-endian later.  */
13342   if (d->perm[0] == 0)
13343     odd = 0;
13344   else if (d->perm[0] == 1)
13345     odd = 1;
13346   else
13347     return false;
13348   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13349
13350   for (i = 0; i < nelt; i++)
13351     {
13352       unsigned elt = (i * 2 + odd) & mask;
13353       if (d->perm[i] != elt)
13354         return false;
13355     }
13356
13357   /* Success!  */
13358   if (d->testing_p)
13359     return true;
13360
13361   in0 = d->op0;
13362   in1 = d->op1;
13363   if (BYTES_BIG_ENDIAN)
13364     {
13365       x = in0, in0 = in1, in1 = x;
13366       odd = !odd;
13367     }
13368   out = d->target;
13369
13370   if (odd)
13371     {
13372       switch (vmode)
13373         {
13374         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13375         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13376         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13377         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13378         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13379         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13380         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13381         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13382         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13383         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13384         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13385         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13386         default:
13387           return false;
13388         }
13389     }
13390   else
13391     {
13392       switch (vmode)
13393         {
13394         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13395         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13396         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13397         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13398         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13399         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13400         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13401         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13402         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13403         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13404         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13405         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13406         default:
13407           return false;
13408         }
13409     }
13410
13411   emit_insn (gen (out, in0, in1));
13412   return true;
13413 }
13414
13415 /* Recognize patterns suitable for the ZIP instructions.  */
13416 static bool
13417 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13418 {
13419   unsigned int i, high, mask, nelt = d->nelt;
13420   rtx out, in0, in1, x;
13421   rtx (*gen) (rtx, rtx, rtx);
13422   machine_mode vmode = d->vmode;
13423
13424   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13425     return false;
13426
13427   /* Note that these are little-endian tests.
13428      We correct for big-endian later.  */
13429   high = nelt / 2;
13430   if (d->perm[0] == high)
13431     /* Do Nothing.  */
13432     ;
13433   else if (d->perm[0] == 0)
13434     high = 0;
13435   else
13436     return false;
13437   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13438
13439   for (i = 0; i < nelt / 2; i++)
13440     {
13441       unsigned elt = (i + high) & mask;
13442       if (d->perm[i * 2] != elt)
13443         return false;
13444       elt = (elt + nelt) & mask;
13445       if (d->perm[i * 2 + 1] != elt)
13446         return false;
13447     }
13448
13449   /* Success!  */
13450   if (d->testing_p)
13451     return true;
13452
13453   in0 = d->op0;
13454   in1 = d->op1;
13455   if (BYTES_BIG_ENDIAN)
13456     {
13457       x = in0, in0 = in1, in1 = x;
13458       high = !high;
13459     }
13460   out = d->target;
13461
13462   if (high)
13463     {
13464       switch (vmode)
13465         {
13466         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13467         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13468         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13469         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13470         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13471         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13472         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13473         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13474         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13475         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13476         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13477         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13478         default:
13479           return false;
13480         }
13481     }
13482   else
13483     {
13484       switch (vmode)
13485         {
13486         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13487         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13488         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13489         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13490         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13491         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13492         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13493         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13494         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13495         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13496         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13497         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13498         default:
13499           return false;
13500         }
13501     }
13502
13503   emit_insn (gen (out, in0, in1));
13504   return true;
13505 }
13506
13507 /* Recognize patterns for the EXT insn.  */
13508
13509 static bool
13510 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13511 {
13512   unsigned int i, nelt = d->nelt;
13513   rtx (*gen) (rtx, rtx, rtx, rtx);
13514   rtx offset;
13515
13516   unsigned int location = d->perm[0]; /* Always < nelt.  */
13517
13518   /* Check if the extracted indices are increasing by one.  */
13519   for (i = 1; i < nelt; i++)
13520     {
13521       unsigned int required = location + i;
13522       if (d->one_vector_p)
13523         {
13524           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13525           required &= (nelt - 1);
13526         }
13527       if (d->perm[i] != required)
13528         return false;
13529     }
13530
13531   switch (d->vmode)
13532     {
13533     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13534     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13535     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13536     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13537     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13538     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13539     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13540     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13541     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13542     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13543     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13544     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13545     default:
13546       return false;
13547     }
13548
13549   /* Success! */
13550   if (d->testing_p)
13551     return true;
13552
13553   /* The case where (location == 0) is a no-op for both big- and little-endian,
13554      and is removed by the mid-end at optimization levels -O1 and higher.  */
13555
13556   if (BYTES_BIG_ENDIAN && (location != 0))
13557     {
13558       /* After setup, we want the high elements of the first vector (stored
13559          at the LSB end of the register), and the low elements of the second
13560          vector (stored at the MSB end of the register). So swap.  */
13561       std::swap (d->op0, d->op1);
13562       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13563       location = nelt - location;
13564     }
13565
13566   offset = GEN_INT (location);
13567   emit_insn (gen (d->target, d->op0, d->op1, offset));
13568   return true;
13569 }
13570
13571 /* Recognize patterns for the REV insns.  */
13572
13573 static bool
13574 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13575 {
13576   unsigned int i, j, diff, nelt = d->nelt;
13577   rtx (*gen) (rtx, rtx);
13578
13579   if (!d->one_vector_p)
13580     return false;
13581
13582   diff = d->perm[0];
13583   switch (diff)
13584     {
13585     case 7:
13586       switch (d->vmode)
13587         {
13588         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13589         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13590         default:
13591           return false;
13592         }
13593       break;
13594     case 3:
13595       switch (d->vmode)
13596         {
13597         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13598         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13599         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13600         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13601         default:
13602           return false;
13603         }
13604       break;
13605     case 1:
13606       switch (d->vmode)
13607         {
13608         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13609         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13610         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13611         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13612         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13613         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13614         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13615         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13616         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13617         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13618         default:
13619           return false;
13620         }
13621       break;
13622     default:
13623       return false;
13624     }
13625
13626   for (i = 0; i < nelt ; i += diff + 1)
13627     for (j = 0; j <= diff; j += 1)
13628       {
13629         /* This is guaranteed to be true as the value of diff
13630            is 7, 3, 1 and we should have enough elements in the
13631            queue to generate this.  Getting a vector mask with a
13632            value of diff other than these values implies that
13633            something is wrong by the time we get here.  */
13634         gcc_assert (i + j < nelt);
13635         if (d->perm[i + j] != i + diff - j)
13636           return false;
13637       }
13638
13639   /* Success! */
13640   if (d->testing_p)
13641     return true;
13642
13643   emit_insn (gen (d->target, d->op0));
13644   return true;
13645 }
13646
13647 static bool
13648 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13649 {
13650   rtx (*gen) (rtx, rtx, rtx);
13651   rtx out = d->target;
13652   rtx in0;
13653   machine_mode vmode = d->vmode;
13654   unsigned int i, elt, nelt = d->nelt;
13655   rtx lane;
13656
13657   elt = d->perm[0];
13658   for (i = 1; i < nelt; i++)
13659     {
13660       if (elt != d->perm[i])
13661         return false;
13662     }
13663
13664   /* The generic preparation in aarch64_expand_vec_perm_const_1
13665      swaps the operand order and the permute indices if it finds
13666      d->perm[0] to be in the second operand.  Thus, we can always
13667      use d->op0 and need not do any extra arithmetic to get the
13668      correct lane number.  */
13669   in0 = d->op0;
13670   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13671
13672   switch (vmode)
13673     {
13674     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13675     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13676     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13677     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13678     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13679     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13680     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13681     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13682     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13683     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13684     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13685     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13686     default:
13687       return false;
13688     }
13689
13690   emit_insn (gen (out, in0, lane));
13691   return true;
13692 }
13693
13694 static bool
13695 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13696 {
13697   rtx rperm[MAX_VECT_LEN], sel;
13698   machine_mode vmode = d->vmode;
13699   unsigned int i, nelt = d->nelt;
13700
13701   if (d->testing_p)
13702     return true;
13703
13704   /* Generic code will try constant permutation twice.  Once with the
13705      original mode and again with the elements lowered to QImode.
13706      So wait and don't do the selector expansion ourselves.  */
13707   if (vmode != V8QImode && vmode != V16QImode)
13708     return false;
13709
13710   for (i = 0; i < nelt; ++i)
13711     {
13712       int nunits = GET_MODE_NUNITS (vmode);
13713
13714       /* If big-endian and two vectors we end up with a weird mixed-endian
13715          mode on NEON.  Reverse the index within each word but not the word
13716          itself.  */
13717       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13718                                            : d->perm[i]);
13719     }
13720   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13721   sel = force_reg (vmode, sel);
13722
13723   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13724   return true;
13725 }
13726
13727 static bool
13728 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13729 {
13730   /* The pattern matching functions above are written to look for a small
13731      number to begin the sequence (0, 1, N/2).  If we begin with an index
13732      from the second operand, we can swap the operands.  */
13733   if (d->perm[0] >= d->nelt)
13734     {
13735       unsigned i, nelt = d->nelt;
13736
13737       gcc_assert (nelt == (nelt & -nelt));
13738       for (i = 0; i < nelt; ++i)
13739         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13740
13741       std::swap (d->op0, d->op1);
13742     }
13743
13744   if (TARGET_SIMD)
13745     {
13746       if (aarch64_evpc_rev (d))
13747         return true;
13748       else if (aarch64_evpc_ext (d))
13749         return true;
13750       else if (aarch64_evpc_dup (d))
13751         return true;
13752       else if (aarch64_evpc_zip (d))
13753         return true;
13754       else if (aarch64_evpc_uzp (d))
13755         return true;
13756       else if (aarch64_evpc_trn (d))
13757         return true;
13758       return aarch64_evpc_tbl (d);
13759     }
13760   return false;
13761 }
13762
13763 /* Expand a vec_perm_const pattern.  */
13764
13765 bool
13766 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13767 {
13768   struct expand_vec_perm_d d;
13769   int i, nelt, which;
13770
13771   d.target = target;
13772   d.op0 = op0;
13773   d.op1 = op1;
13774
13775   d.vmode = GET_MODE (target);
13776   gcc_assert (VECTOR_MODE_P (d.vmode));
13777   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13778   d.testing_p = false;
13779
13780   for (i = which = 0; i < nelt; ++i)
13781     {
13782       rtx e = XVECEXP (sel, 0, i);
13783       int ei = INTVAL (e) & (2 * nelt - 1);
13784       which |= (ei < nelt ? 1 : 2);
13785       d.perm[i] = ei;
13786     }
13787
13788   switch (which)
13789     {
13790     default:
13791       gcc_unreachable ();
13792
13793     case 3:
13794       d.one_vector_p = false;
13795       if (!rtx_equal_p (op0, op1))
13796         break;
13797
13798       /* The elements of PERM do not suggest that only the first operand
13799          is used, but both operands are identical.  Allow easier matching
13800          of the permutation by folding the permutation into the single
13801          input vector.  */
13802       /* Fall Through.  */
13803     case 2:
13804       for (i = 0; i < nelt; ++i)
13805         d.perm[i] &= nelt - 1;
13806       d.op0 = op1;
13807       d.one_vector_p = true;
13808       break;
13809
13810     case 1:
13811       d.op1 = op0;
13812       d.one_vector_p = true;
13813       break;
13814     }
13815
13816   return aarch64_expand_vec_perm_const_1 (&d);
13817 }
13818
13819 static bool
13820 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13821                                      const unsigned char *sel)
13822 {
13823   struct expand_vec_perm_d d;
13824   unsigned int i, nelt, which;
13825   bool ret;
13826
13827   d.vmode = vmode;
13828   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13829   d.testing_p = true;
13830   memcpy (d.perm, sel, nelt);
13831
13832   /* Calculate whether all elements are in one vector.  */
13833   for (i = which = 0; i < nelt; ++i)
13834     {
13835       unsigned char e = d.perm[i];
13836       gcc_assert (e < 2 * nelt);
13837       which |= (e < nelt ? 1 : 2);
13838     }
13839
13840   /* If all elements are from the second vector, reindex as if from the
13841      first vector.  */
13842   if (which == 2)
13843     for (i = 0; i < nelt; ++i)
13844       d.perm[i] -= nelt;
13845
13846   /* Check whether the mask can be applied to a single vector.  */
13847   d.one_vector_p = (which != 3);
13848
13849   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13850   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13851   if (!d.one_vector_p)
13852     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13853
13854   start_sequence ();
13855   ret = aarch64_expand_vec_perm_const_1 (&d);
13856   end_sequence ();
13857
13858   return ret;
13859 }
13860
13861 rtx
13862 aarch64_reverse_mask (machine_mode mode)
13863 {
13864   /* We have to reverse each vector because we dont have
13865      a permuted load that can reverse-load according to ABI rules.  */
13866   rtx mask;
13867   rtvec v = rtvec_alloc (16);
13868   int i, j;
13869   int nunits = GET_MODE_NUNITS (mode);
13870   int usize = GET_MODE_UNIT_SIZE (mode);
13871
13872   gcc_assert (BYTES_BIG_ENDIAN);
13873   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13874
13875   for (i = 0; i < nunits; i++)
13876     for (j = 0; j < usize; j++)
13877       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13878   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13879   return force_reg (V16QImode, mask);
13880 }
13881
13882 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
13883    true.  However due to issues with register allocation it is preferable
13884    to avoid tieing integer scalar and FP scalar modes.  Executing integer
13885    operations in general registers is better than treating them as scalar
13886    vector operations.  This reduces latency and avoids redundant int<->FP
13887    moves.  So tie modes if they are either the same class, or vector modes
13888    with other vector modes, vector structs or any scalar mode.  */
13889
13890 static bool
13891 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13892 {
13893   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13894     return true;
13895
13896   /* We specifically want to allow elements of "structure" modes to
13897      be tieable to the structure.  This more general condition allows
13898      other rarer situations too.  */
13899   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13900     return true;
13901
13902   /* Also allow any scalar modes with vectors.  */
13903   if (aarch64_vector_mode_supported_p (mode1)
13904       || aarch64_vector_mode_supported_p (mode2))
13905     return true;
13906
13907   return false;
13908 }
13909
13910 /* Return a new RTX holding the result of moving POINTER forward by
13911    AMOUNT bytes.  */
13912
13913 static rtx
13914 aarch64_move_pointer (rtx pointer, int amount)
13915 {
13916   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13917
13918   return adjust_automodify_address (pointer, GET_MODE (pointer),
13919                                     next, amount);
13920 }
13921
13922 /* Return a new RTX holding the result of moving POINTER forward by the
13923    size of the mode it points to.  */
13924
13925 static rtx
13926 aarch64_progress_pointer (rtx pointer)
13927 {
13928   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13929
13930   return aarch64_move_pointer (pointer, amount);
13931 }
13932
13933 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13934    MODE bytes.  */
13935
13936 static void
13937 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13938                                               machine_mode mode)
13939 {
13940   rtx reg = gen_reg_rtx (mode);
13941
13942   /* "Cast" the pointers to the correct mode.  */
13943   *src = adjust_address (*src, mode, 0);
13944   *dst = adjust_address (*dst, mode, 0);
13945   /* Emit the memcpy.  */
13946   emit_move_insn (reg, *src);
13947   emit_move_insn (*dst, reg);
13948   /* Move the pointers forward.  */
13949   *src = aarch64_progress_pointer (*src);
13950   *dst = aarch64_progress_pointer (*dst);
13951 }
13952
13953 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13954    we succeed, otherwise return false.  */
13955
13956 bool
13957 aarch64_expand_movmem (rtx *operands)
13958 {
13959   unsigned int n;
13960   rtx dst = operands[0];
13961   rtx src = operands[1];
13962   rtx base;
13963   bool speed_p = !optimize_function_for_size_p (cfun);
13964
13965   /* When optimizing for size, give a better estimate of the length of a
13966      memcpy call, but use the default otherwise.  */
13967   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13968
13969   /* We can't do anything smart if the amount to copy is not constant.  */
13970   if (!CONST_INT_P (operands[2]))
13971     return false;
13972
13973   n = UINTVAL (operands[2]);
13974
13975   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13976      need to make at most two moves.  For cases above 16 bytes it will be one
13977      move for each 16 byte chunk, then at most two additional moves.  */
13978   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13979     return false;
13980
13981   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13982   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13983
13984   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13985   src = adjust_automodify_address (src, VOIDmode, base, 0);
13986
13987   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13988      1-byte chunk.  */
13989   if (n < 4)
13990     {
13991       if (n >= 2)
13992         {
13993           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13994           n -= 2;
13995         }
13996
13997       if (n == 1)
13998         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13999
14000       return true;
14001     }
14002
14003   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
14004      4-byte chunk, partially overlapping with the previously copied chunk.  */
14005   if (n < 8)
14006     {
14007       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14008       n -= 4;
14009       if (n > 0)
14010         {
14011           int move = n - 4;
14012
14013           src = aarch64_move_pointer (src, move);
14014           dst = aarch64_move_pointer (dst, move);
14015           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14016         }
14017       return true;
14018     }
14019
14020   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
14021      them, then (if applicable) an 8-byte chunk.  */
14022   while (n >= 8)
14023     {
14024       if (n / 16)
14025         {
14026           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14027           n -= 16;
14028         }
14029       else
14030         {
14031           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14032           n -= 8;
14033         }
14034     }
14035
14036   /* Finish the final bytes of the copy.  We can always do this in one
14037      instruction.  We either copy the exact amount we need, or partially
14038      overlap with the previous chunk we copied and copy 8-bytes.  */
14039   if (n == 0)
14040     return true;
14041   else if (n == 1)
14042     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14043   else if (n == 2)
14044     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14045   else if (n == 4)
14046     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14047   else
14048     {
14049       if (n == 3)
14050         {
14051           src = aarch64_move_pointer (src, -1);
14052           dst = aarch64_move_pointer (dst, -1);
14053           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14054         }
14055       else
14056         {
14057           int move = n - 8;
14058
14059           src = aarch64_move_pointer (src, move);
14060           dst = aarch64_move_pointer (dst, move);
14061           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14062         }
14063     }
14064
14065   return true;
14066 }
14067
14068 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14069    SImode stores.  Handle the case when the constant has identical
14070    bottom and top halves.  This is beneficial when the two stores can be
14071    merged into an STP and we avoid synthesising potentially expensive
14072    immediates twice.  Return true if such a split is possible.  */
14073
14074 bool
14075 aarch64_split_dimode_const_store (rtx dst, rtx src)
14076 {
14077   rtx lo = gen_lowpart (SImode, src);
14078   rtx hi = gen_highpart_mode (SImode, DImode, src);
14079
14080   bool size_p = optimize_function_for_size_p (cfun);
14081
14082   if (!rtx_equal_p (lo, hi))
14083     return false;
14084
14085   unsigned int orig_cost
14086     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14087   unsigned int lo_cost
14088     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14089
14090   /* We want to transform:
14091      MOV        x1, 49370
14092      MOVK       x1, 0x140, lsl 16
14093      MOVK       x1, 0xc0da, lsl 32
14094      MOVK       x1, 0x140, lsl 48
14095      STR        x1, [x0]
14096    into:
14097      MOV        w1, 49370
14098      MOVK       w1, 0x140, lsl 16
14099      STP        w1, w1, [x0]
14100    So we want to perform this only when we save two instructions
14101    or more.  When optimizing for size, however, accept any code size
14102    savings we can.  */
14103   if (size_p && orig_cost <= lo_cost)
14104     return false;
14105
14106   if (!size_p
14107       && (orig_cost <= lo_cost + 1))
14108     return false;
14109
14110   rtx mem_lo = adjust_address (dst, SImode, 0);
14111   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14112     return false;
14113
14114   rtx tmp_reg = gen_reg_rtx (SImode);
14115   aarch64_expand_mov_immediate (tmp_reg, lo);
14116   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14117   /* Don't emit an explicit store pair as this may not be always profitable.
14118      Let the sched-fusion logic decide whether to merge them.  */
14119   emit_move_insn (mem_lo, tmp_reg);
14120   emit_move_insn (mem_hi, tmp_reg);
14121
14122   return true;
14123 }
14124
14125 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14126
14127 static unsigned HOST_WIDE_INT
14128 aarch64_asan_shadow_offset (void)
14129 {
14130   return (HOST_WIDE_INT_1 << 36);
14131 }
14132
14133 static bool
14134 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14135                                         unsigned int align,
14136                                         enum by_pieces_operation op,
14137                                         bool speed_p)
14138 {
14139   /* STORE_BY_PIECES can be used when copying a constant string, but
14140      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14141      For now we always fail this and let the move_by_pieces code copy
14142      the string from read-only memory.  */
14143   if (op == STORE_BY_PIECES)
14144     return false;
14145
14146   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14147 }
14148
14149 static rtx
14150 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14151                         int code, tree treeop0, tree treeop1)
14152 {
14153   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14154   rtx op0, op1;
14155   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14156   insn_code icode;
14157   struct expand_operand ops[4];
14158
14159   start_sequence ();
14160   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14161
14162   op_mode = GET_MODE (op0);
14163   if (op_mode == VOIDmode)
14164     op_mode = GET_MODE (op1);
14165
14166   switch (op_mode)
14167     {
14168     case E_QImode:
14169     case E_HImode:
14170     case E_SImode:
14171       cmp_mode = SImode;
14172       icode = CODE_FOR_cmpsi;
14173       break;
14174
14175     case E_DImode:
14176       cmp_mode = DImode;
14177       icode = CODE_FOR_cmpdi;
14178       break;
14179
14180     case E_SFmode:
14181       cmp_mode = SFmode;
14182       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14183       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14184       break;
14185
14186     case E_DFmode:
14187       cmp_mode = DFmode;
14188       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14189       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14190       break;
14191
14192     default:
14193       end_sequence ();
14194       return NULL_RTX;
14195     }
14196
14197   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14198   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14199   if (!op0 || !op1)
14200     {
14201       end_sequence ();
14202       return NULL_RTX;
14203     }
14204   *prep_seq = get_insns ();
14205   end_sequence ();
14206
14207   create_fixed_operand (&ops[0], op0);
14208   create_fixed_operand (&ops[1], op1);
14209
14210   start_sequence ();
14211   if (!maybe_expand_insn (icode, 2, ops))
14212     {
14213       end_sequence ();
14214       return NULL_RTX;
14215     }
14216   *gen_seq = get_insns ();
14217   end_sequence ();
14218
14219   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14220                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14221 }
14222
14223 static rtx
14224 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14225                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14226 {
14227   rtx op0, op1, target;
14228   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14229   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14230   insn_code icode;
14231   struct expand_operand ops[6];
14232   int aarch64_cond;
14233
14234   push_to_sequence (*prep_seq);
14235   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14236
14237   op_mode = GET_MODE (op0);
14238   if (op_mode == VOIDmode)
14239     op_mode = GET_MODE (op1);
14240
14241   switch (op_mode)
14242     {
14243     case E_QImode:
14244     case E_HImode:
14245     case E_SImode:
14246       cmp_mode = SImode;
14247       icode = CODE_FOR_ccmpsi;
14248       break;
14249
14250     case E_DImode:
14251       cmp_mode = DImode;
14252       icode = CODE_FOR_ccmpdi;
14253       break;
14254
14255     case E_SFmode:
14256       cmp_mode = SFmode;
14257       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14258       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14259       break;
14260
14261     case E_DFmode:
14262       cmp_mode = DFmode;
14263       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14264       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14265       break;
14266
14267     default:
14268       end_sequence ();
14269       return NULL_RTX;
14270     }
14271
14272   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14273   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14274   if (!op0 || !op1)
14275     {
14276       end_sequence ();
14277       return NULL_RTX;
14278     }
14279   *prep_seq = get_insns ();
14280   end_sequence ();
14281
14282   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14283   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14284
14285   if (bit_code != AND)
14286     {
14287       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14288                                                 GET_MODE (XEXP (prev, 0))),
14289                              VOIDmode, XEXP (prev, 0), const0_rtx);
14290       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14291     }
14292
14293   create_fixed_operand (&ops[0], XEXP (prev, 0));
14294   create_fixed_operand (&ops[1], target);
14295   create_fixed_operand (&ops[2], op0);
14296   create_fixed_operand (&ops[3], op1);
14297   create_fixed_operand (&ops[4], prev);
14298   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14299
14300   push_to_sequence (*gen_seq);
14301   if (!maybe_expand_insn (icode, 6, ops))
14302     {
14303       end_sequence ();
14304       return NULL_RTX;
14305     }
14306
14307   *gen_seq = get_insns ();
14308   end_sequence ();
14309
14310   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14311 }
14312
14313 #undef TARGET_GEN_CCMP_FIRST
14314 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14315
14316 #undef TARGET_GEN_CCMP_NEXT
14317 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14318
14319 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14320    instruction fusion of some sort.  */
14321
14322 static bool
14323 aarch64_macro_fusion_p (void)
14324 {
14325   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14326 }
14327
14328
14329 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14330    should be kept together during scheduling.  */
14331
14332 static bool
14333 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14334 {
14335   rtx set_dest;
14336   rtx prev_set = single_set (prev);
14337   rtx curr_set = single_set (curr);
14338   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14339   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14340
14341   if (!aarch64_macro_fusion_p ())
14342     return false;
14343
14344   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14345     {
14346       /* We are trying to match:
14347          prev (mov)  == (set (reg r0) (const_int imm16))
14348          curr (movk) == (set (zero_extract (reg r0)
14349                                            (const_int 16)
14350                                            (const_int 16))
14351                              (const_int imm16_1))  */
14352
14353       set_dest = SET_DEST (curr_set);
14354
14355       if (GET_CODE (set_dest) == ZERO_EXTRACT
14356           && CONST_INT_P (SET_SRC (curr_set))
14357           && CONST_INT_P (SET_SRC (prev_set))
14358           && CONST_INT_P (XEXP (set_dest, 2))
14359           && INTVAL (XEXP (set_dest, 2)) == 16
14360           && REG_P (XEXP (set_dest, 0))
14361           && REG_P (SET_DEST (prev_set))
14362           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14363         {
14364           return true;
14365         }
14366     }
14367
14368   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14369     {
14370
14371       /*  We're trying to match:
14372           prev (adrp) == (set (reg r1)
14373                               (high (symbol_ref ("SYM"))))
14374           curr (add) == (set (reg r0)
14375                              (lo_sum (reg r1)
14376                                      (symbol_ref ("SYM"))))
14377           Note that r0 need not necessarily be the same as r1, especially
14378           during pre-regalloc scheduling.  */
14379
14380       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14381           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14382         {
14383           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14384               && REG_P (XEXP (SET_SRC (curr_set), 0))
14385               && REGNO (XEXP (SET_SRC (curr_set), 0))
14386                  == REGNO (SET_DEST (prev_set))
14387               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14388                               XEXP (SET_SRC (curr_set), 1)))
14389             return true;
14390         }
14391     }
14392
14393   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14394     {
14395
14396       /* We're trying to match:
14397          prev (movk) == (set (zero_extract (reg r0)
14398                                            (const_int 16)
14399                                            (const_int 32))
14400                              (const_int imm16_1))
14401          curr (movk) == (set (zero_extract (reg r0)
14402                                            (const_int 16)
14403                                            (const_int 48))
14404                              (const_int imm16_2))  */
14405
14406       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14407           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14408           && REG_P (XEXP (SET_DEST (prev_set), 0))
14409           && REG_P (XEXP (SET_DEST (curr_set), 0))
14410           && REGNO (XEXP (SET_DEST (prev_set), 0))
14411              == REGNO (XEXP (SET_DEST (curr_set), 0))
14412           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14413           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14414           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14415           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14416           && CONST_INT_P (SET_SRC (prev_set))
14417           && CONST_INT_P (SET_SRC (curr_set)))
14418         return true;
14419
14420     }
14421   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14422     {
14423       /* We're trying to match:
14424           prev (adrp) == (set (reg r0)
14425                               (high (symbol_ref ("SYM"))))
14426           curr (ldr) == (set (reg r1)
14427                              (mem (lo_sum (reg r0)
14428                                              (symbol_ref ("SYM")))))
14429                  or
14430           curr (ldr) == (set (reg r1)
14431                              (zero_extend (mem
14432                                            (lo_sum (reg r0)
14433                                                    (symbol_ref ("SYM"))))))  */
14434       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14435           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14436         {
14437           rtx curr_src = SET_SRC (curr_set);
14438
14439           if (GET_CODE (curr_src) == ZERO_EXTEND)
14440             curr_src = XEXP (curr_src, 0);
14441
14442           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14443               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14444               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14445                  == REGNO (SET_DEST (prev_set))
14446               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14447                               XEXP (SET_SRC (prev_set), 0)))
14448               return true;
14449         }
14450     }
14451
14452   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14453        && aarch_crypto_can_dual_issue (prev, curr))
14454     return true;
14455
14456   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14457       && any_condjump_p (curr))
14458     {
14459       enum attr_type prev_type = get_attr_type (prev);
14460
14461       unsigned int condreg1, condreg2;
14462       rtx cc_reg_1;
14463       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14464       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14465
14466       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14467           && prev
14468           && modified_in_p (cc_reg_1, prev))
14469         {
14470           /* FIXME: this misses some which is considered simple arthematic
14471              instructions for ThunderX.  Simple shifts are missed here.  */
14472           if (prev_type == TYPE_ALUS_SREG
14473               || prev_type == TYPE_ALUS_IMM
14474               || prev_type == TYPE_LOGICS_REG
14475               || prev_type == TYPE_LOGICS_IMM)
14476             return true;
14477         }
14478     }
14479
14480   if (prev_set
14481       && curr_set
14482       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14483       && any_condjump_p (curr))
14484     {
14485       /* We're trying to match:
14486           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14487           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14488                                                          (const_int 0))
14489                                                  (label_ref ("SYM"))
14490                                                  (pc))  */
14491       if (SET_DEST (curr_set) == (pc_rtx)
14492           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14493           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14494           && REG_P (SET_DEST (prev_set))
14495           && REGNO (SET_DEST (prev_set))
14496              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14497         {
14498           /* Fuse ALU operations followed by conditional branch instruction.  */
14499           switch (get_attr_type (prev))
14500             {
14501             case TYPE_ALU_IMM:
14502             case TYPE_ALU_SREG:
14503             case TYPE_ADC_REG:
14504             case TYPE_ADC_IMM:
14505             case TYPE_ADCS_REG:
14506             case TYPE_ADCS_IMM:
14507             case TYPE_LOGIC_REG:
14508             case TYPE_LOGIC_IMM:
14509             case TYPE_CSEL:
14510             case TYPE_ADR:
14511             case TYPE_MOV_IMM:
14512             case TYPE_SHIFT_REG:
14513             case TYPE_SHIFT_IMM:
14514             case TYPE_BFM:
14515             case TYPE_RBIT:
14516             case TYPE_REV:
14517             case TYPE_EXTEND:
14518               return true;
14519
14520             default:;
14521             }
14522         }
14523     }
14524
14525   return false;
14526 }
14527
14528 /* Return true iff the instruction fusion described by OP is enabled.  */
14529
14530 bool
14531 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14532 {
14533   return (aarch64_tune_params.fusible_ops & op) != 0;
14534 }
14535
14536 /* If MEM is in the form of [base+offset], extract the two parts
14537    of address and set to BASE and OFFSET, otherwise return false
14538    after clearing BASE and OFFSET.  */
14539
14540 bool
14541 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14542 {
14543   rtx addr;
14544
14545   gcc_assert (MEM_P (mem));
14546
14547   addr = XEXP (mem, 0);
14548
14549   if (REG_P (addr))
14550     {
14551       *base = addr;
14552       *offset = const0_rtx;
14553       return true;
14554     }
14555
14556   if (GET_CODE (addr) == PLUS
14557       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14558     {
14559       *base = XEXP (addr, 0);
14560       *offset = XEXP (addr, 1);
14561       return true;
14562     }
14563
14564   *base = NULL_RTX;
14565   *offset = NULL_RTX;
14566
14567   return false;
14568 }
14569
14570 /* Types for scheduling fusion.  */
14571 enum sched_fusion_type
14572 {
14573   SCHED_FUSION_NONE = 0,
14574   SCHED_FUSION_LD_SIGN_EXTEND,
14575   SCHED_FUSION_LD_ZERO_EXTEND,
14576   SCHED_FUSION_LD,
14577   SCHED_FUSION_ST,
14578   SCHED_FUSION_NUM
14579 };
14580
14581 /* If INSN is a load or store of address in the form of [base+offset],
14582    extract the two parts and set to BASE and OFFSET.  Return scheduling
14583    fusion type this INSN is.  */
14584
14585 static enum sched_fusion_type
14586 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14587 {
14588   rtx x, dest, src;
14589   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14590
14591   gcc_assert (INSN_P (insn));
14592   x = PATTERN (insn);
14593   if (GET_CODE (x) != SET)
14594     return SCHED_FUSION_NONE;
14595
14596   src = SET_SRC (x);
14597   dest = SET_DEST (x);
14598
14599   machine_mode dest_mode = GET_MODE (dest);
14600
14601   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14602     return SCHED_FUSION_NONE;
14603
14604   if (GET_CODE (src) == SIGN_EXTEND)
14605     {
14606       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14607       src = XEXP (src, 0);
14608       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14609         return SCHED_FUSION_NONE;
14610     }
14611   else if (GET_CODE (src) == ZERO_EXTEND)
14612     {
14613       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14614       src = XEXP (src, 0);
14615       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14616         return SCHED_FUSION_NONE;
14617     }
14618
14619   if (GET_CODE (src) == MEM && REG_P (dest))
14620     extract_base_offset_in_addr (src, base, offset);
14621   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14622     {
14623       fusion = SCHED_FUSION_ST;
14624       extract_base_offset_in_addr (dest, base, offset);
14625     }
14626   else
14627     return SCHED_FUSION_NONE;
14628
14629   if (*base == NULL_RTX || *offset == NULL_RTX)
14630     fusion = SCHED_FUSION_NONE;
14631
14632   return fusion;
14633 }
14634
14635 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14636
14637    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14638    and PRI are only calculated for these instructions.  For other instruction,
14639    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14640    type instruction fusion can be added by returning different priorities.
14641
14642    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14643
14644 static void
14645 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14646                                int *fusion_pri, int *pri)
14647 {
14648   int tmp, off_val;
14649   rtx base, offset;
14650   enum sched_fusion_type fusion;
14651
14652   gcc_assert (INSN_P (insn));
14653
14654   tmp = max_pri - 1;
14655   fusion = fusion_load_store (insn, &base, &offset);
14656   if (fusion == SCHED_FUSION_NONE)
14657     {
14658       *pri = tmp;
14659       *fusion_pri = tmp;
14660       return;
14661     }
14662
14663   /* Set FUSION_PRI according to fusion type and base register.  */
14664   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14665
14666   /* Calculate PRI.  */
14667   tmp /= 2;
14668
14669   /* INSN with smaller offset goes first.  */
14670   off_val = (int)(INTVAL (offset));
14671   if (off_val >= 0)
14672     tmp -= (off_val & 0xfffff);
14673   else
14674     tmp += ((- off_val) & 0xfffff);
14675
14676   *pri = tmp;
14677   return;
14678 }
14679
14680 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14681    Adjust priority of sha1h instructions so they are scheduled before
14682    other SHA1 instructions.  */
14683
14684 static int
14685 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14686 {
14687   rtx x = PATTERN (insn);
14688
14689   if (GET_CODE (x) == SET)
14690     {
14691       x = SET_SRC (x);
14692
14693       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14694         return priority + 10;
14695     }
14696
14697   return priority;
14698 }
14699
14700 /* Given OPERANDS of consecutive load/store, check if we can merge
14701    them into ldp/stp.  LOAD is true if they are load instructions.
14702    MODE is the mode of memory operands.  */
14703
14704 bool
14705 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14706                                 machine_mode mode)
14707 {
14708   HOST_WIDE_INT offval_1, offval_2, msize;
14709   enum reg_class rclass_1, rclass_2;
14710   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14711
14712   if (load)
14713     {
14714       mem_1 = operands[1];
14715       mem_2 = operands[3];
14716       reg_1 = operands[0];
14717       reg_2 = operands[2];
14718       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14719       if (REGNO (reg_1) == REGNO (reg_2))
14720         return false;
14721     }
14722   else
14723     {
14724       mem_1 = operands[0];
14725       mem_2 = operands[2];
14726       reg_1 = operands[1];
14727       reg_2 = operands[3];
14728     }
14729
14730   /* The mems cannot be volatile.  */
14731   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14732     return false;
14733
14734   /* If we have SImode and slow unaligned ldp,
14735      check the alignment to be at least 8 byte. */
14736   if (mode == SImode
14737       && (aarch64_tune_params.extra_tuning_flags
14738           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14739       && !optimize_size
14740       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14741     return false;
14742
14743   /* Check if the addresses are in the form of [base+offset].  */
14744   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14745   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14746     return false;
14747   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14748   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14749     return false;
14750
14751   /* Check if the bases are same.  */
14752   if (!rtx_equal_p (base_1, base_2))
14753     return false;
14754
14755   offval_1 = INTVAL (offset_1);
14756   offval_2 = INTVAL (offset_2);
14757   msize = GET_MODE_SIZE (mode);
14758   /* Check if the offsets are consecutive.  */
14759   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14760     return false;
14761
14762   /* Check if the addresses are clobbered by load.  */
14763   if (load)
14764     {
14765       if (reg_mentioned_p (reg_1, mem_1))
14766         return false;
14767
14768       /* In increasing order, the last load can clobber the address.  */
14769       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14770       return false;
14771     }
14772
14773   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14774     rclass_1 = FP_REGS;
14775   else
14776     rclass_1 = GENERAL_REGS;
14777
14778   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14779     rclass_2 = FP_REGS;
14780   else
14781     rclass_2 = GENERAL_REGS;
14782
14783   /* Check if the registers are of same class.  */
14784   if (rclass_1 != rclass_2)
14785     return false;
14786
14787   return true;
14788 }
14789
14790 /* Given OPERANDS of consecutive load/store, check if we can merge
14791    them into ldp/stp by adjusting the offset.  LOAD is true if they
14792    are load instructions.  MODE is the mode of memory operands.
14793
14794    Given below consecutive stores:
14795
14796      str  w1, [xb, 0x100]
14797      str  w1, [xb, 0x104]
14798      str  w1, [xb, 0x108]
14799      str  w1, [xb, 0x10c]
14800
14801    Though the offsets are out of the range supported by stp, we can
14802    still pair them after adjusting the offset, like:
14803
14804      add  scratch, xb, 0x100
14805      stp  w1, w1, [scratch]
14806      stp  w1, w1, [scratch, 0x8]
14807
14808    The peephole patterns detecting this opportunity should guarantee
14809    the scratch register is avaliable.  */
14810
14811 bool
14812 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14813                                        scalar_mode mode)
14814 {
14815   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14816   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14817   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14818   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14819
14820   if (load)
14821     {
14822       reg_1 = operands[0];
14823       mem_1 = operands[1];
14824       reg_2 = operands[2];
14825       mem_2 = operands[3];
14826       reg_3 = operands[4];
14827       mem_3 = operands[5];
14828       reg_4 = operands[6];
14829       mem_4 = operands[7];
14830       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14831                   && REG_P (reg_3) && REG_P (reg_4));
14832       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14833         return false;
14834     }
14835   else
14836     {
14837       mem_1 = operands[0];
14838       reg_1 = operands[1];
14839       mem_2 = operands[2];
14840       reg_2 = operands[3];
14841       mem_3 = operands[4];
14842       reg_3 = operands[5];
14843       mem_4 = operands[6];
14844       reg_4 = operands[7];
14845     }
14846   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14847   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14848     return false;
14849
14850   /* The mems cannot be volatile.  */
14851   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14852       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14853     return false;
14854
14855   /* Check if the addresses are in the form of [base+offset].  */
14856   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14857   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14858     return false;
14859   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14860   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14861     return false;
14862   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14863   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14864     return false;
14865   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14866   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14867     return false;
14868
14869   /* Check if the bases are same.  */
14870   if (!rtx_equal_p (base_1, base_2)
14871       || !rtx_equal_p (base_2, base_3)
14872       || !rtx_equal_p (base_3, base_4))
14873     return false;
14874
14875   offval_1 = INTVAL (offset_1);
14876   offval_2 = INTVAL (offset_2);
14877   offval_3 = INTVAL (offset_3);
14878   offval_4 = INTVAL (offset_4);
14879   msize = GET_MODE_SIZE (mode);
14880   /* Check if the offsets are consecutive.  */
14881   if ((offval_1 != (offval_2 + msize)
14882        || offval_1 != (offval_3 + msize * 2)
14883        || offval_1 != (offval_4 + msize * 3))
14884       && (offval_4 != (offval_3 + msize)
14885           || offval_4 != (offval_2 + msize * 2)
14886           || offval_4 != (offval_1 + msize * 3)))
14887     return false;
14888
14889   /* Check if the addresses are clobbered by load.  */
14890   if (load)
14891     {
14892       if (reg_mentioned_p (reg_1, mem_1)
14893           || reg_mentioned_p (reg_2, mem_2)
14894           || reg_mentioned_p (reg_3, mem_3))
14895         return false;
14896
14897       /* In increasing order, the last load can clobber the address.  */
14898       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14899         return false;
14900     }
14901
14902   /* If we have SImode and slow unaligned ldp,
14903      check the alignment to be at least 8 byte. */
14904   if (mode == SImode
14905       && (aarch64_tune_params.extra_tuning_flags
14906           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14907       && !optimize_size
14908       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14909     return false;
14910
14911   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14912     rclass_1 = FP_REGS;
14913   else
14914     rclass_1 = GENERAL_REGS;
14915
14916   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14917     rclass_2 = FP_REGS;
14918   else
14919     rclass_2 = GENERAL_REGS;
14920
14921   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14922     rclass_3 = FP_REGS;
14923   else
14924     rclass_3 = GENERAL_REGS;
14925
14926   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14927     rclass_4 = FP_REGS;
14928   else
14929     rclass_4 = GENERAL_REGS;
14930
14931   /* Check if the registers are of same class.  */
14932   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14933     return false;
14934
14935   return true;
14936 }
14937
14938 /* Given OPERANDS of consecutive load/store, this function pairs them
14939    into ldp/stp after adjusting the offset.  It depends on the fact
14940    that addresses of load/store instructions are in increasing order.
14941    MODE is the mode of memory operands.  CODE is the rtl operator
14942    which should be applied to all memory operands, it's SIGN_EXTEND,
14943    ZERO_EXTEND or UNKNOWN.  */
14944
14945 bool
14946 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14947                              scalar_mode mode, RTX_CODE code)
14948 {
14949   rtx base, offset, t1, t2;
14950   rtx mem_1, mem_2, mem_3, mem_4;
14951   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14952
14953   if (load)
14954     {
14955       mem_1 = operands[1];
14956       mem_2 = operands[3];
14957       mem_3 = operands[5];
14958       mem_4 = operands[7];
14959     }
14960   else
14961     {
14962       mem_1 = operands[0];
14963       mem_2 = operands[2];
14964       mem_3 = operands[4];
14965       mem_4 = operands[6];
14966       gcc_assert (code == UNKNOWN);
14967     }
14968
14969   extract_base_offset_in_addr (mem_1, &base, &offset);
14970   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14971
14972   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14973   msize = GET_MODE_SIZE (mode);
14974   stp_off_limit = msize * 0x40;
14975   off_val = INTVAL (offset);
14976   abs_off = (off_val < 0) ? -off_val : off_val;
14977   new_off = abs_off % stp_off_limit;
14978   adj_off = abs_off - new_off;
14979
14980   /* Further adjust to make sure all offsets are OK.  */
14981   if ((new_off + msize * 2) >= stp_off_limit)
14982     {
14983       adj_off += stp_off_limit;
14984       new_off -= stp_off_limit;
14985     }
14986
14987   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14988   if (adj_off >= 0x1000)
14989     return false;
14990
14991   if (off_val < 0)
14992     {
14993       adj_off = -adj_off;
14994       new_off = -new_off;
14995     }
14996
14997   /* Create new memory references.  */
14998   mem_1 = change_address (mem_1, VOIDmode,
14999                           plus_constant (DImode, operands[8], new_off));
15000
15001   /* Check if the adjusted address is OK for ldp/stp.  */
15002   if (!aarch64_mem_pair_operand (mem_1, mode))
15003     return false;
15004
15005   msize = GET_MODE_SIZE (mode);
15006   mem_2 = change_address (mem_2, VOIDmode,
15007                           plus_constant (DImode,
15008                                          operands[8],
15009                                          new_off + msize));
15010   mem_3 = change_address (mem_3, VOIDmode,
15011                           plus_constant (DImode,
15012                                          operands[8],
15013                                          new_off + msize * 2));
15014   mem_4 = change_address (mem_4, VOIDmode,
15015                           plus_constant (DImode,
15016                                          operands[8],
15017                                          new_off + msize * 3));
15018
15019   if (code == ZERO_EXTEND)
15020     {
15021       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15022       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15023       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15024       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15025     }
15026   else if (code == SIGN_EXTEND)
15027     {
15028       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15029       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15030       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15031       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15032     }
15033
15034   if (load)
15035     {
15036       operands[1] = mem_1;
15037       operands[3] = mem_2;
15038       operands[5] = mem_3;
15039       operands[7] = mem_4;
15040     }
15041   else
15042     {
15043       operands[0] = mem_1;
15044       operands[2] = mem_2;
15045       operands[4] = mem_3;
15046       operands[6] = mem_4;
15047     }
15048
15049   /* Emit adjusting instruction.  */
15050   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15051   /* Emit ldp/stp instructions.  */
15052   t1 = gen_rtx_SET (operands[0], operands[1]);
15053   t2 = gen_rtx_SET (operands[2], operands[3]);
15054   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15055   t1 = gen_rtx_SET (operands[4], operands[5]);
15056   t2 = gen_rtx_SET (operands[6], operands[7]);
15057   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15058   return true;
15059 }
15060
15061 /* Return 1 if pseudo register should be created and used to hold
15062    GOT address for PIC code.  */
15063
15064 bool
15065 aarch64_use_pseudo_pic_reg (void)
15066 {
15067   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15068 }
15069
15070 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15071
15072 static int
15073 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15074 {
15075   switch (XINT (x, 1))
15076     {
15077     case UNSPEC_GOTSMALLPIC:
15078     case UNSPEC_GOTSMALLPIC28K:
15079     case UNSPEC_GOTTINYPIC:
15080       return 0;
15081     default:
15082       break;
15083     }
15084
15085   return default_unspec_may_trap_p (x, flags);
15086 }
15087
15088
15089 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15090    return the log2 of that value.  Otherwise return -1.  */
15091
15092 int
15093 aarch64_fpconst_pow_of_2 (rtx x)
15094 {
15095   const REAL_VALUE_TYPE *r;
15096
15097   if (!CONST_DOUBLE_P (x))
15098     return -1;
15099
15100   r = CONST_DOUBLE_REAL_VALUE (x);
15101
15102   if (REAL_VALUE_NEGATIVE (*r)
15103       || REAL_VALUE_ISNAN (*r)
15104       || REAL_VALUE_ISINF (*r)
15105       || !real_isinteger (r, DFmode))
15106     return -1;
15107
15108   return exact_log2 (real_to_integer (r));
15109 }
15110
15111 /* If X is a vector of equal CONST_DOUBLE values and that value is
15112    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15113
15114 int
15115 aarch64_vec_fpconst_pow_of_2 (rtx x)
15116 {
15117   if (GET_CODE (x) != CONST_VECTOR)
15118     return -1;
15119
15120   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15121     return -1;
15122
15123   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15124   if (firstval <= 0)
15125     return -1;
15126
15127   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15128     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15129       return -1;
15130
15131   return firstval;
15132 }
15133
15134 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15135    to float.
15136
15137    __fp16 always promotes through this hook.
15138    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15139    through the generic excess precision logic rather than here.  */
15140
15141 static tree
15142 aarch64_promoted_type (const_tree t)
15143 {
15144   if (SCALAR_FLOAT_TYPE_P (t)
15145       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15146     return float_type_node;
15147
15148   return NULL_TREE;
15149 }
15150
15151 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15152
15153 static bool
15154 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15155                            optimization_type opt_type)
15156 {
15157   switch (op)
15158     {
15159     case rsqrt_optab:
15160       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15161
15162     default:
15163       return true;
15164     }
15165 }
15166
15167 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15168    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15169
15170 static bool
15171 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15172 {
15173   return (mode == HFmode
15174           ? true
15175           : default_libgcc_floating_mode_supported_p (mode));
15176 }
15177
15178 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15179    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15180
15181 static bool
15182 aarch64_scalar_mode_supported_p (scalar_mode mode)
15183 {
15184   return (mode == HFmode
15185           ? true
15186           : default_scalar_mode_supported_p (mode));
15187 }
15188
15189 /* Set the value of FLT_EVAL_METHOD.
15190    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15191
15192     0: evaluate all operations and constants, whose semantic type has at
15193        most the range and precision of type float, to the range and
15194        precision of float; evaluate all other operations and constants to
15195        the range and precision of the semantic type;
15196
15197     N, where _FloatN is a supported interchange floating type
15198        evaluate all operations and constants, whose semantic type has at
15199        most the range and precision of _FloatN type, to the range and
15200        precision of the _FloatN type; evaluate all other operations and
15201        constants to the range and precision of the semantic type;
15202
15203    If we have the ARMv8.2-A extensions then we support _Float16 in native
15204    precision, so we should set this to 16.  Otherwise, we support the type,
15205    but want to evaluate expressions in float precision, so set this to
15206    0.  */
15207
15208 static enum flt_eval_method
15209 aarch64_excess_precision (enum excess_precision_type type)
15210 {
15211   switch (type)
15212     {
15213       case EXCESS_PRECISION_TYPE_FAST:
15214       case EXCESS_PRECISION_TYPE_STANDARD:
15215         /* We can calculate either in 16-bit range and precision or
15216            32-bit range and precision.  Make that decision based on whether
15217            we have native support for the ARMv8.2-A 16-bit floating-point
15218            instructions or not.  */
15219         return (TARGET_FP_F16INST
15220                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15221                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15222       case EXCESS_PRECISION_TYPE_IMPLICIT:
15223         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15224       default:
15225         gcc_unreachable ();
15226     }
15227   return FLT_EVAL_METHOD_UNPREDICTABLE;
15228 }
15229
15230 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15231    scheduled for speculative execution.  Reject the long-running division
15232    and square-root instructions.  */
15233
15234 static bool
15235 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15236 {
15237   switch (get_attr_type (insn))
15238     {
15239       case TYPE_SDIV:
15240       case TYPE_UDIV:
15241       case TYPE_FDIVS:
15242       case TYPE_FDIVD:
15243       case TYPE_FSQRTS:
15244       case TYPE_FSQRTD:
15245       case TYPE_NEON_FP_SQRT_S:
15246       case TYPE_NEON_FP_SQRT_D:
15247       case TYPE_NEON_FP_SQRT_S_Q:
15248       case TYPE_NEON_FP_SQRT_D_Q:
15249       case TYPE_NEON_FP_DIV_S:
15250       case TYPE_NEON_FP_DIV_D:
15251       case TYPE_NEON_FP_DIV_S_Q:
15252       case TYPE_NEON_FP_DIV_D_Q:
15253         return false;
15254       default:
15255         return true;
15256     }
15257 }
15258
15259 /* Target-specific selftests.  */
15260
15261 #if CHECKING_P
15262
15263 namespace selftest {
15264
15265 /* Selftest for the RTL loader.
15266    Verify that the RTL loader copes with a dump from
15267    print_rtx_function.  This is essentially just a test that class
15268    function_reader can handle a real dump, but it also verifies
15269    that lookup_reg_by_dump_name correctly handles hard regs.
15270    The presence of hard reg names in the dump means that the test is
15271    target-specific, hence it is in this file.  */
15272
15273 static void
15274 aarch64_test_loading_full_dump ()
15275 {
15276   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15277
15278   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15279
15280   rtx_insn *insn_1 = get_insn_by_uid (1);
15281   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15282
15283   rtx_insn *insn_15 = get_insn_by_uid (15);
15284   ASSERT_EQ (INSN, GET_CODE (insn_15));
15285   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15286
15287   /* Verify crtl->return_rtx.  */
15288   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15289   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15290   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15291 }
15292
15293 /* Run all target-specific selftests.  */
15294
15295 static void
15296 aarch64_run_selftests (void)
15297 {
15298   aarch64_test_loading_full_dump ();
15299 }
15300
15301 } // namespace selftest
15302
15303 #endif /* #if CHECKING_P */
15304
15305 #undef TARGET_ADDRESS_COST
15306 #define TARGET_ADDRESS_COST aarch64_address_cost
15307
15308 /* This hook will determines whether unnamed bitfields affect the alignment
15309    of the containing structure.  The hook returns true if the structure
15310    should inherit the alignment requirements of an unnamed bitfield's
15311    type.  */
15312 #undef TARGET_ALIGN_ANON_BITFIELD
15313 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15314
15315 #undef TARGET_ASM_ALIGNED_DI_OP
15316 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15317
15318 #undef TARGET_ASM_ALIGNED_HI_OP
15319 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15320
15321 #undef TARGET_ASM_ALIGNED_SI_OP
15322 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15323
15324 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15325 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15326   hook_bool_const_tree_hwi_hwi_const_tree_true
15327
15328 #undef TARGET_ASM_FILE_START
15329 #define TARGET_ASM_FILE_START aarch64_start_file
15330
15331 #undef TARGET_ASM_OUTPUT_MI_THUNK
15332 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15333
15334 #undef TARGET_ASM_SELECT_RTX_SECTION
15335 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15336
15337 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15338 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15339
15340 #undef TARGET_BUILD_BUILTIN_VA_LIST
15341 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15342
15343 #undef TARGET_CALLEE_COPIES
15344 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15345
15346 #undef TARGET_CAN_ELIMINATE
15347 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15348
15349 #undef TARGET_CAN_INLINE_P
15350 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15351
15352 #undef TARGET_CANNOT_FORCE_CONST_MEM
15353 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15354
15355 #undef TARGET_CASE_VALUES_THRESHOLD
15356 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15357
15358 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15359 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15360
15361 /* Only the least significant bit is used for initialization guard
15362    variables.  */
15363 #undef TARGET_CXX_GUARD_MASK_BIT
15364 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15365
15366 #undef TARGET_C_MODE_FOR_SUFFIX
15367 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15368
15369 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15370 #undef  TARGET_DEFAULT_TARGET_FLAGS
15371 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15372 #endif
15373
15374 #undef TARGET_CLASS_MAX_NREGS
15375 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15376
15377 #undef TARGET_BUILTIN_DECL
15378 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15379
15380 #undef TARGET_BUILTIN_RECIPROCAL
15381 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15382
15383 #undef TARGET_C_EXCESS_PRECISION
15384 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15385
15386 #undef  TARGET_EXPAND_BUILTIN
15387 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15388
15389 #undef TARGET_EXPAND_BUILTIN_VA_START
15390 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15391
15392 #undef TARGET_FOLD_BUILTIN
15393 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15394
15395 #undef TARGET_FUNCTION_ARG
15396 #define TARGET_FUNCTION_ARG aarch64_function_arg
15397
15398 #undef TARGET_FUNCTION_ARG_ADVANCE
15399 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15400
15401 #undef TARGET_FUNCTION_ARG_BOUNDARY
15402 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15403
15404 #undef TARGET_FUNCTION_ARG_PADDING
15405 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15406
15407 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15408 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15409
15410 #undef TARGET_FUNCTION_VALUE
15411 #define TARGET_FUNCTION_VALUE aarch64_function_value
15412
15413 #undef TARGET_FUNCTION_VALUE_REGNO_P
15414 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15415
15416 #undef TARGET_FRAME_POINTER_REQUIRED
15417 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15418
15419 #undef TARGET_GIMPLE_FOLD_BUILTIN
15420 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15421
15422 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15423 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15424
15425 #undef  TARGET_INIT_BUILTINS
15426 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15427
15428 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15429 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15430   aarch64_ira_change_pseudo_allocno_class
15431
15432 #undef TARGET_LEGITIMATE_ADDRESS_P
15433 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15434
15435 #undef TARGET_LEGITIMATE_CONSTANT_P
15436 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15437
15438 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15439 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15440   aarch64_legitimize_address_displacement
15441
15442 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15443 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15444
15445 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15446 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15447 aarch64_libgcc_floating_mode_supported_p
15448
15449 #undef TARGET_MANGLE_TYPE
15450 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15451
15452 #undef TARGET_MEMORY_MOVE_COST
15453 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15454
15455 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15456 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15457
15458 #undef TARGET_MUST_PASS_IN_STACK
15459 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15460
15461 /* This target hook should return true if accesses to volatile bitfields
15462    should use the narrowest mode possible.  It should return false if these
15463    accesses should use the bitfield container type.  */
15464 #undef TARGET_NARROW_VOLATILE_BITFIELD
15465 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15466
15467 #undef  TARGET_OPTION_OVERRIDE
15468 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15469
15470 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15471 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15472   aarch64_override_options_after_change
15473
15474 #undef TARGET_OPTION_SAVE
15475 #define TARGET_OPTION_SAVE aarch64_option_save
15476
15477 #undef TARGET_OPTION_RESTORE
15478 #define TARGET_OPTION_RESTORE aarch64_option_restore
15479
15480 #undef TARGET_OPTION_PRINT
15481 #define TARGET_OPTION_PRINT aarch64_option_print
15482
15483 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15484 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15485
15486 #undef TARGET_SET_CURRENT_FUNCTION
15487 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15488
15489 #undef TARGET_PASS_BY_REFERENCE
15490 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15491
15492 #undef TARGET_PREFERRED_RELOAD_CLASS
15493 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15494
15495 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15496 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15497
15498 #undef TARGET_PROMOTED_TYPE
15499 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15500
15501 #undef TARGET_SECONDARY_RELOAD
15502 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15503
15504 #undef TARGET_SHIFT_TRUNCATION_MASK
15505 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15506
15507 #undef TARGET_SETUP_INCOMING_VARARGS
15508 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15509
15510 #undef TARGET_STRUCT_VALUE_RTX
15511 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15512
15513 #undef TARGET_REGISTER_MOVE_COST
15514 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15515
15516 #undef TARGET_RETURN_IN_MEMORY
15517 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15518
15519 #undef TARGET_RETURN_IN_MSB
15520 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15521
15522 #undef TARGET_RTX_COSTS
15523 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15524
15525 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15526 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15527
15528 #undef TARGET_SCHED_ISSUE_RATE
15529 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15530
15531 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15532 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15533   aarch64_sched_first_cycle_multipass_dfa_lookahead
15534
15535 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15536 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15537   aarch64_first_cycle_multipass_dfa_lookahead_guard
15538
15539 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15540 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15541   aarch64_get_separate_components
15542
15543 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15544 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15545   aarch64_components_for_bb
15546
15547 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15548 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15549   aarch64_disqualify_components
15550
15551 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15552 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15553   aarch64_emit_prologue_components
15554
15555 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15556 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15557   aarch64_emit_epilogue_components
15558
15559 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15560 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15561   aarch64_set_handled_components
15562
15563 #undef TARGET_TRAMPOLINE_INIT
15564 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15565
15566 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15567 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15568
15569 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15570 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15571
15572 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15573 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15574   aarch64_builtin_support_vector_misalignment
15575
15576 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15577 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15578
15579 #undef TARGET_VECTORIZE_ADD_STMT_COST
15580 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15581
15582 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15583 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15584   aarch64_builtin_vectorization_cost
15585
15586 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15587 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15588
15589 #undef TARGET_VECTORIZE_BUILTINS
15590 #define TARGET_VECTORIZE_BUILTINS
15591
15592 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15593 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15594   aarch64_builtin_vectorized_function
15595
15596 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15597 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15598   aarch64_autovectorize_vector_sizes
15599
15600 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15601 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15602   aarch64_atomic_assign_expand_fenv
15603
15604 /* Section anchor support.  */
15605
15606 #undef TARGET_MIN_ANCHOR_OFFSET
15607 #define TARGET_MIN_ANCHOR_OFFSET -256
15608
15609 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15610    byte offset; we can do much more for larger data types, but have no way
15611    to determine the size of the access.  We assume accesses are aligned.  */
15612 #undef TARGET_MAX_ANCHOR_OFFSET
15613 #define TARGET_MAX_ANCHOR_OFFSET 4095
15614
15615 #undef TARGET_VECTOR_ALIGNMENT
15616 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15617
15618 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15619 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15620   aarch64_simd_vector_alignment_reachable
15621
15622 /* vec_perm support.  */
15623
15624 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15625 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15626   aarch64_vectorize_vec_perm_const_ok
15627
15628 #undef TARGET_INIT_LIBFUNCS
15629 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15630
15631 #undef TARGET_FIXED_CONDITION_CODE_REGS
15632 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15633
15634 #undef TARGET_FLAGS_REGNUM
15635 #define TARGET_FLAGS_REGNUM CC_REGNUM
15636
15637 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15638 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15639
15640 #undef TARGET_ASAN_SHADOW_OFFSET
15641 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15642
15643 #undef TARGET_LEGITIMIZE_ADDRESS
15644 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15645
15646 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15647 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15648   aarch64_use_by_pieces_infrastructure_p
15649
15650 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15651 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15652
15653 #undef TARGET_CAN_USE_DOLOOP_P
15654 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15655
15656 #undef TARGET_SCHED_ADJUST_PRIORITY
15657 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15658
15659 #undef TARGET_SCHED_MACRO_FUSION_P
15660 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15661
15662 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15663 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15664
15665 #undef TARGET_SCHED_FUSION_PRIORITY
15666 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15667
15668 #undef TARGET_UNSPEC_MAY_TRAP_P
15669 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15670
15671 #undef TARGET_USE_PSEUDO_PIC_REG
15672 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15673
15674 #undef TARGET_PRINT_OPERAND
15675 #define TARGET_PRINT_OPERAND aarch64_print_operand
15676
15677 #undef TARGET_PRINT_OPERAND_ADDRESS
15678 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15679
15680 #undef TARGET_OPTAB_SUPPORTED_P
15681 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15682
15683 #undef TARGET_OMIT_STRUCT_RETURN_REG
15684 #define TARGET_OMIT_STRUCT_RETURN_REG true
15685
15686 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15687 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15688 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15689
15690 #undef TARGET_HARD_REGNO_MODE_OK
15691 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15692
15693 #undef TARGET_MODES_TIEABLE_P
15694 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15695
15696 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15697 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15698   aarch64_hard_regno_call_part_clobbered
15699
15700 #if CHECKING_P
15701 #undef TARGET_RUN_TARGET_SELFTESTS
15702 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15703 #endif /* #if CHECKING_P */
15704
15705 struct gcc_target targetm = TARGET_INITIALIZER;
15706
15707 #include "gt-aarch64.h"