gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode,
 145                                                  vec_perm_indices);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (scalar_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement TARGET_HARD_REGNO_NREGS.  */
1071
1072 static unsigned int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1087
1088 static bool
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return true;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1110       else
1111         return true;
1112     }
1113
1114   return false;
1115 }
1116
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1118    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1119    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1120
1121 static bool
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1123 {
1124   return FP_REGNUM_P (regno) && GET_MODE_SIZE (mode) > 8;
1125 }
1126
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1128 machine_mode
1129 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1130                                      machine_mode mode)
1131 {
1132   /* Handle modes that fit within single registers.  */
1133   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1134     {
1135       if (GET_MODE_SIZE (mode) >= 4)
1136         return mode;
1137       else
1138         return SImode;
1139     }
1140   /* Fall back to generic for multi-reg and very large modes.  */
1141   else
1142     return choose_hard_reg_mode (regno, nregs, false);
1143 }
1144
1145 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1146    that strcpy from constants will be faster.  */
1147
1148 static HOST_WIDE_INT
1149 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1150 {
1151   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1152     return MAX (align, BITS_PER_WORD);
1153   return align;
1154 }
1155
1156 /* Return true if calls to DECL should be treated as
1157    long-calls (ie called via a register).  */
1158 static bool
1159 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1160 {
1161   return false;
1162 }
1163
1164 /* Return true if calls to symbol-ref SYM should be treated as
1165    long-calls (ie called via a register).  */
1166 bool
1167 aarch64_is_long_call_p (rtx sym)
1168 {
1169   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1170 }
1171
1172 /* Return true if calls to symbol-ref SYM should not go through
1173    plt stubs.  */
1174
1175 bool
1176 aarch64_is_noplt_call_p (rtx sym)
1177 {
1178   const_tree decl = SYMBOL_REF_DECL (sym);
1179
1180   if (flag_pic
1181       && decl
1182       && (!flag_plt
1183           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1184       && !targetm.binds_local_p (decl))
1185     return true;
1186
1187   return false;
1188 }
1189
1190 /* Return true if the offsets to a zero/sign-extract operation
1191    represent an expression that matches an extend operation.  The
1192    operands represent the paramters from
1193
1194    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1195 bool
1196 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1197                                 rtx extract_imm)
1198 {
1199   HOST_WIDE_INT mult_val, extract_val;
1200
1201   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1202     return false;
1203
1204   mult_val = INTVAL (mult_imm);
1205   extract_val = INTVAL (extract_imm);
1206
1207   if (extract_val > 8
1208       && extract_val < GET_MODE_BITSIZE (mode)
1209       && exact_log2 (extract_val & ~7) > 0
1210       && (extract_val & 7) <= 4
1211       && mult_val == (1 << (extract_val & 7)))
1212     return true;
1213
1214   return false;
1215 }
1216
1217 /* Emit an insn that's a simple single-set.  Both the operands must be
1218    known to be valid.  */
1219 inline static rtx_insn *
1220 emit_set_insn (rtx x, rtx y)
1221 {
1222   return emit_insn (gen_rtx_SET (x, y));
1223 }
1224
1225 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1226    return the rtx for register 0 in the proper mode.  */
1227 rtx
1228 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1229 {
1230   machine_mode mode = SELECT_CC_MODE (code, x, y);
1231   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1232
1233   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1234   return cc_reg;
1235 }
1236
1237 /* Build the SYMBOL_REF for __tls_get_addr.  */
1238
1239 static GTY(()) rtx tls_get_addr_libfunc;
1240
1241 rtx
1242 aarch64_tls_get_addr (void)
1243 {
1244   if (!tls_get_addr_libfunc)
1245     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1246   return tls_get_addr_libfunc;
1247 }
1248
1249 /* Return the TLS model to use for ADDR.  */
1250
1251 static enum tls_model
1252 tls_symbolic_operand_type (rtx addr)
1253 {
1254   enum tls_model tls_kind = TLS_MODEL_NONE;
1255   rtx sym, addend;
1256
1257   if (GET_CODE (addr) == CONST)
1258     {
1259       split_const (addr, &sym, &addend);
1260       if (GET_CODE (sym) == SYMBOL_REF)
1261         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1262     }
1263   else if (GET_CODE (addr) == SYMBOL_REF)
1264     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1265
1266   return tls_kind;
1267 }
1268
1269 /* We'll allow lo_sum's in addresses in our legitimate addresses
1270    so that combine would take care of combining addresses where
1271    necessary, but for generation purposes, we'll generate the address
1272    as :
1273    RTL                               Absolute
1274    tmp = hi (symbol_ref);            adrp  x1, foo
1275    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1276                                      nop
1277
1278    PIC                               TLS
1279    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1280    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1281                                      bl   __tls_get_addr
1282                                      nop
1283
1284    Load TLS symbol, depending on TLS mechanism and TLS access model.
1285
1286    Global Dynamic - Traditional TLS:
1287    adrp tmp, :tlsgd:imm
1288    add  dest, tmp, #:tlsgd_lo12:imm
1289    bl   __tls_get_addr
1290
1291    Global Dynamic - TLS Descriptors:
1292    adrp dest, :tlsdesc:imm
1293    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1294    add  dest, dest, #:tlsdesc_lo12:imm
1295    blr  tmp
1296    mrs  tp, tpidr_el0
1297    add  dest, dest, tp
1298
1299    Initial Exec:
1300    mrs  tp, tpidr_el0
1301    adrp tmp, :gottprel:imm
1302    ldr  dest, [tmp, #:gottprel_lo12:imm]
1303    add  dest, dest, tp
1304
1305    Local Exec:
1306    mrs  tp, tpidr_el0
1307    add  t0, tp, #:tprel_hi12:imm, lsl #12
1308    add  t0, t0, #:tprel_lo12_nc:imm
1309 */
1310
1311 static void
1312 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1313                                    enum aarch64_symbol_type type)
1314 {
1315   switch (type)
1316     {
1317     case SYMBOL_SMALL_ABSOLUTE:
1318       {
1319         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1320         rtx tmp_reg = dest;
1321         machine_mode mode = GET_MODE (dest);
1322
1323         gcc_assert (mode == Pmode || mode == ptr_mode);
1324
1325         if (can_create_pseudo_p ())
1326           tmp_reg = gen_reg_rtx (mode);
1327
1328         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1329         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1330         return;
1331       }
1332
1333     case SYMBOL_TINY_ABSOLUTE:
1334       emit_insn (gen_rtx_SET (dest, imm));
1335       return;
1336
1337     case SYMBOL_SMALL_GOT_28K:
1338       {
1339         machine_mode mode = GET_MODE (dest);
1340         rtx gp_rtx = pic_offset_table_rtx;
1341         rtx insn;
1342         rtx mem;
1343
1344         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1345            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1346            decide rtx costs, in which case pic_offset_table_rtx is not
1347            initialized.  For that case no need to generate the first adrp
1348            instruction as the final cost for global variable access is
1349            one instruction.  */
1350         if (gp_rtx != NULL)
1351           {
1352             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1353                using the page base as GOT base, the first page may be wasted,
1354                in the worst scenario, there is only 28K space for GOT).
1355
1356                The generate instruction sequence for accessing global variable
1357                is:
1358
1359                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1360
1361                Only one instruction needed. But we must initialize
1362                pic_offset_table_rtx properly.  We generate initialize insn for
1363                every global access, and allow CSE to remove all redundant.
1364
1365                The final instruction sequences will look like the following
1366                for multiply global variables access.
1367
1368                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1369
1370                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1371                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1372                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1373                  ...  */
1374
1375             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1376             crtl->uses_pic_offset_table = 1;
1377             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1378
1379             if (mode != GET_MODE (gp_rtx))
1380              gp_rtx = gen_lowpart (mode, gp_rtx);
1381
1382           }
1383
1384         if (mode == ptr_mode)
1385           {
1386             if (mode == DImode)
1387               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1388             else
1389               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1390
1391             mem = XVECEXP (SET_SRC (insn), 0, 0);
1392           }
1393         else
1394           {
1395             gcc_assert (mode == Pmode);
1396
1397             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1398             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1399           }
1400
1401         /* The operand is expected to be MEM.  Whenever the related insn
1402            pattern changed, above code which calculate mem should be
1403            updated.  */
1404         gcc_assert (GET_CODE (mem) == MEM);
1405         MEM_READONLY_P (mem) = 1;
1406         MEM_NOTRAP_P (mem) = 1;
1407         emit_insn (insn);
1408         return;
1409       }
1410
1411     case SYMBOL_SMALL_GOT_4G:
1412       {
1413         /* In ILP32, the mode of dest can be either SImode or DImode,
1414            while the got entry is always of SImode size.  The mode of
1415            dest depends on how dest is used: if dest is assigned to a
1416            pointer (e.g. in the memory), it has SImode; it may have
1417            DImode if dest is dereferenced to access the memeory.
1418            This is why we have to handle three different ldr_got_small
1419            patterns here (two patterns for ILP32).  */
1420
1421         rtx insn;
1422         rtx mem;
1423         rtx tmp_reg = dest;
1424         machine_mode mode = GET_MODE (dest);
1425
1426         if (can_create_pseudo_p ())
1427           tmp_reg = gen_reg_rtx (mode);
1428
1429         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1430         if (mode == ptr_mode)
1431           {
1432             if (mode == DImode)
1433               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1434             else
1435               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1436
1437             mem = XVECEXP (SET_SRC (insn), 0, 0);
1438           }
1439         else
1440           {
1441             gcc_assert (mode == Pmode);
1442
1443             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1444             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1445           }
1446
1447         gcc_assert (GET_CODE (mem) == MEM);
1448         MEM_READONLY_P (mem) = 1;
1449         MEM_NOTRAP_P (mem) = 1;
1450         emit_insn (insn);
1451         return;
1452       }
1453
1454     case SYMBOL_SMALL_TLSGD:
1455       {
1456         rtx_insn *insns;
1457         machine_mode mode = GET_MODE (dest);
1458         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1459
1460         start_sequence ();
1461         if (TARGET_ILP32)
1462           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1463         else
1464           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1465         insns = get_insns ();
1466         end_sequence ();
1467
1468         RTL_CONST_CALL_P (insns) = 1;
1469         emit_libcall_block (insns, dest, result, imm);
1470         return;
1471       }
1472
1473     case SYMBOL_SMALL_TLSDESC:
1474       {
1475         machine_mode mode = GET_MODE (dest);
1476         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1477         rtx tp;
1478
1479         gcc_assert (mode == Pmode || mode == ptr_mode);
1480
1481         /* In ILP32, the got entry is always of SImode size.  Unlike
1482            small GOT, the dest is fixed at reg 0.  */
1483         if (TARGET_ILP32)
1484           emit_insn (gen_tlsdesc_small_si (imm));
1485         else
1486           emit_insn (gen_tlsdesc_small_di (imm));
1487         tp = aarch64_load_tp (NULL);
1488
1489         if (mode != Pmode)
1490           tp = gen_lowpart (mode, tp);
1491
1492         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1493         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1494         return;
1495       }
1496
1497     case SYMBOL_SMALL_TLSIE:
1498       {
1499         /* In ILP32, the mode of dest can be either SImode or DImode,
1500            while the got entry is always of SImode size.  The mode of
1501            dest depends on how dest is used: if dest is assigned to a
1502            pointer (e.g. in the memory), it has SImode; it may have
1503            DImode if dest is dereferenced to access the memeory.
1504            This is why we have to handle three different tlsie_small
1505            patterns here (two patterns for ILP32).  */
1506         machine_mode mode = GET_MODE (dest);
1507         rtx tmp_reg = gen_reg_rtx (mode);
1508         rtx tp = aarch64_load_tp (NULL);
1509
1510         if (mode == ptr_mode)
1511           {
1512             if (mode == DImode)
1513               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1514             else
1515               {
1516                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1517                 tp = gen_lowpart (mode, tp);
1518               }
1519           }
1520         else
1521           {
1522             gcc_assert (mode == Pmode);
1523             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1524           }
1525
1526         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1527         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1528         return;
1529       }
1530
1531     case SYMBOL_TLSLE12:
1532     case SYMBOL_TLSLE24:
1533     case SYMBOL_TLSLE32:
1534     case SYMBOL_TLSLE48:
1535       {
1536         machine_mode mode = GET_MODE (dest);
1537         rtx tp = aarch64_load_tp (NULL);
1538
1539         if (mode != Pmode)
1540           tp = gen_lowpart (mode, tp);
1541
1542         switch (type)
1543           {
1544           case SYMBOL_TLSLE12:
1545             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1546                         (dest, tp, imm));
1547             break;
1548           case SYMBOL_TLSLE24:
1549             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1550                         (dest, tp, imm));
1551           break;
1552           case SYMBOL_TLSLE32:
1553             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1554                         (dest, imm));
1555             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1556                         (dest, dest, tp));
1557           break;
1558           case SYMBOL_TLSLE48:
1559             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1560                         (dest, imm));
1561             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1562                         (dest, dest, tp));
1563             break;
1564           default:
1565             gcc_unreachable ();
1566           }
1567
1568         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1569         return;
1570       }
1571
1572     case SYMBOL_TINY_GOT:
1573       emit_insn (gen_ldr_got_tiny (dest, imm));
1574       return;
1575
1576     case SYMBOL_TINY_TLSIE:
1577       {
1578         machine_mode mode = GET_MODE (dest);
1579         rtx tp = aarch64_load_tp (NULL);
1580
1581         if (mode == ptr_mode)
1582           {
1583             if (mode == DImode)
1584               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1585             else
1586               {
1587                 tp = gen_lowpart (mode, tp);
1588                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1589               }
1590           }
1591         else
1592           {
1593             gcc_assert (mode == Pmode);
1594             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1595           }
1596
1597         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1598         return;
1599       }
1600
1601     default:
1602       gcc_unreachable ();
1603     }
1604 }
1605
1606 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1607    handle all moves if !can_create_pseudo_p ().  The distinction is
1608    important because, unlike emit_move_insn, the move expanders know
1609    how to force Pmode objects into the constant pool even when the
1610    constant pool address is not itself legitimate.  */
1611 static rtx
1612 aarch64_emit_move (rtx dest, rtx src)
1613 {
1614   return (can_create_pseudo_p ()
1615           ? emit_move_insn (dest, src)
1616           : emit_move_insn_1 (dest, src));
1617 }
1618
1619 /* Split a 128-bit move operation into two 64-bit move operations,
1620    taking care to handle partial overlap of register to register
1621    copies.  Special cases are needed when moving between GP regs and
1622    FP regs.  SRC can be a register, constant or memory; DST a register
1623    or memory.  If either operand is memory it must not have any side
1624    effects.  */
1625 void
1626 aarch64_split_128bit_move (rtx dst, rtx src)
1627 {
1628   rtx dst_lo, dst_hi;
1629   rtx src_lo, src_hi;
1630
1631   machine_mode mode = GET_MODE (dst);
1632
1633   gcc_assert (mode == TImode || mode == TFmode);
1634   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1635   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1636
1637   if (REG_P (dst) && REG_P (src))
1638     {
1639       int src_regno = REGNO (src);
1640       int dst_regno = REGNO (dst);
1641
1642       /* Handle FP <-> GP regs.  */
1643       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1644         {
1645           src_lo = gen_lowpart (word_mode, src);
1646           src_hi = gen_highpart (word_mode, src);
1647
1648           if (mode == TImode)
1649             {
1650               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1651               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1652             }
1653           else
1654             {
1655               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1656               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1657             }
1658           return;
1659         }
1660       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1661         {
1662           dst_lo = gen_lowpart (word_mode, dst);
1663           dst_hi = gen_highpart (word_mode, dst);
1664
1665           if (mode == TImode)
1666             {
1667               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1668               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1669             }
1670           else
1671             {
1672               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1673               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1674             }
1675           return;
1676         }
1677     }
1678
1679   dst_lo = gen_lowpart (word_mode, dst);
1680   dst_hi = gen_highpart (word_mode, dst);
1681   src_lo = gen_lowpart (word_mode, src);
1682   src_hi = gen_highpart_mode (word_mode, mode, src);
1683
1684   /* At most one pairing may overlap.  */
1685   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1686     {
1687       aarch64_emit_move (dst_hi, src_hi);
1688       aarch64_emit_move (dst_lo, src_lo);
1689     }
1690   else
1691     {
1692       aarch64_emit_move (dst_lo, src_lo);
1693       aarch64_emit_move (dst_hi, src_hi);
1694     }
1695 }
1696
1697 bool
1698 aarch64_split_128bit_move_p (rtx dst, rtx src)
1699 {
1700   return (! REG_P (src)
1701           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1702 }
1703
1704 /* Split a complex SIMD combine.  */
1705
1706 void
1707 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1708 {
1709   machine_mode src_mode = GET_MODE (src1);
1710   machine_mode dst_mode = GET_MODE (dst);
1711
1712   gcc_assert (VECTOR_MODE_P (dst_mode));
1713   gcc_assert (register_operand (dst, dst_mode)
1714               && register_operand (src1, src_mode)
1715               && register_operand (src2, src_mode));
1716
1717   rtx (*gen) (rtx, rtx, rtx);
1718
1719   switch (src_mode)
1720     {
1721     case E_V8QImode:
1722       gen = gen_aarch64_simd_combinev8qi;
1723       break;
1724     case E_V4HImode:
1725       gen = gen_aarch64_simd_combinev4hi;
1726       break;
1727     case E_V2SImode:
1728       gen = gen_aarch64_simd_combinev2si;
1729       break;
1730     case E_V4HFmode:
1731       gen = gen_aarch64_simd_combinev4hf;
1732       break;
1733     case E_V2SFmode:
1734       gen = gen_aarch64_simd_combinev2sf;
1735       break;
1736     case E_DImode:
1737       gen = gen_aarch64_simd_combinedi;
1738       break;
1739     case E_DFmode:
1740       gen = gen_aarch64_simd_combinedf;
1741       break;
1742     default:
1743       gcc_unreachable ();
1744     }
1745
1746   emit_insn (gen (dst, src1, src2));
1747   return;
1748 }
1749
1750 /* Split a complex SIMD move.  */
1751
1752 void
1753 aarch64_split_simd_move (rtx dst, rtx src)
1754 {
1755   machine_mode src_mode = GET_MODE (src);
1756   machine_mode dst_mode = GET_MODE (dst);
1757
1758   gcc_assert (VECTOR_MODE_P (dst_mode));
1759
1760   if (REG_P (dst) && REG_P (src))
1761     {
1762       rtx (*gen) (rtx, rtx);
1763
1764       gcc_assert (VECTOR_MODE_P (src_mode));
1765
1766       switch (src_mode)
1767         {
1768         case E_V16QImode:
1769           gen = gen_aarch64_split_simd_movv16qi;
1770           break;
1771         case E_V8HImode:
1772           gen = gen_aarch64_split_simd_movv8hi;
1773           break;
1774         case E_V4SImode:
1775           gen = gen_aarch64_split_simd_movv4si;
1776           break;
1777         case E_V2DImode:
1778           gen = gen_aarch64_split_simd_movv2di;
1779           break;
1780         case E_V8HFmode:
1781           gen = gen_aarch64_split_simd_movv8hf;
1782           break;
1783         case E_V4SFmode:
1784           gen = gen_aarch64_split_simd_movv4sf;
1785           break;
1786         case E_V2DFmode:
1787           gen = gen_aarch64_split_simd_movv2df;
1788           break;
1789         default:
1790           gcc_unreachable ();
1791         }
1792
1793       emit_insn (gen (dst, src));
1794       return;
1795     }
1796 }
1797
1798 bool
1799 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1800                               machine_mode ymode, rtx y)
1801 {
1802   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1803   gcc_assert (r != NULL);
1804   return rtx_equal_p (x, r);
1805 }
1806
1807
1808 static rtx
1809 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1810 {
1811   if (can_create_pseudo_p ())
1812     return force_reg (mode, value);
1813   else
1814     {
1815       x = aarch64_emit_move (x, value);
1816       return x;
1817     }
1818 }
1819
1820
1821 static rtx
1822 aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
1823                     HOST_WIDE_INT offset)
1824 {
1825   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1826     {
1827       rtx high;
1828       /* Load the full offset into a register.  This
1829          might be improvable in the future.  */
1830       high = GEN_INT (offset);
1831       offset = 0;
1832       high = aarch64_force_temporary (mode, temp, high);
1833       reg = aarch64_force_temporary (mode, temp,
1834                                      gen_rtx_PLUS (mode, high, reg));
1835     }
1836   return plus_constant (mode, reg, offset);
1837 }
1838
1839 static int
1840 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1841                                 scalar_int_mode mode)
1842 {
1843   int i;
1844   unsigned HOST_WIDE_INT val, val2, mask;
1845   int one_match, zero_match;
1846   int num_insns;
1847
1848   val = INTVAL (imm);
1849
1850   if (aarch64_move_imm (val, mode))
1851     {
1852       if (generate)
1853         emit_insn (gen_rtx_SET (dest, imm));
1854       return 1;
1855     }
1856
1857   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1858      (with XXXX non-zero). In that case check to see if the move can be done in
1859      a smaller mode.  */
1860   val2 = val & 0xffffffff;
1861   if (mode == DImode
1862       && aarch64_move_imm (val2, SImode)
1863       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1864     {
1865       if (generate)
1866         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1867
1868       /* Check if we have to emit a second instruction by checking to see
1869          if any of the upper 32 bits of the original DI mode value is set.  */
1870       if (val == val2)
1871         return 1;
1872
1873       i = (val >> 48) ? 48 : 32;
1874
1875       if (generate)
1876          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1877                                     GEN_INT ((val >> i) & 0xffff)));
1878
1879       return 2;
1880     }
1881
1882   if ((val >> 32) == 0 || mode == SImode)
1883     {
1884       if (generate)
1885         {
1886           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1887           if (mode == SImode)
1888             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1889                                        GEN_INT ((val >> 16) & 0xffff)));
1890           else
1891             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1892                                        GEN_INT ((val >> 16) & 0xffff)));
1893         }
1894       return 2;
1895     }
1896
1897   /* Remaining cases are all for DImode.  */
1898
1899   mask = 0xffff;
1900   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1901     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1902   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1903     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1904
1905   if (zero_match != 2 && one_match != 2)
1906     {
1907       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1908          For a 64-bit bitmask try whether changing 16 bits to all ones or
1909          zeroes creates a valid bitmask.  To check any repeated bitmask,
1910          try using 16 bits from the other 32-bit half of val.  */
1911
1912       for (i = 0; i < 64; i += 16, mask <<= 16)
1913         {
1914           val2 = val & ~mask;
1915           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1916             break;
1917           val2 = val | mask;
1918           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1919             break;
1920           val2 = val2 & ~mask;
1921           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1922           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1923             break;
1924         }
1925       if (i != 64)
1926         {
1927           if (generate)
1928             {
1929               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1930               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1931                                          GEN_INT ((val >> i) & 0xffff)));
1932             }
1933           return 2;
1934         }
1935     }
1936
1937   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1938      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1939      otherwise skip zero bits.  */
1940
1941   num_insns = 1;
1942   mask = 0xffff;
1943   val2 = one_match > zero_match ? ~val : val;
1944   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1945
1946   if (generate)
1947     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1948                                            ? (val | ~(mask << i))
1949                                            : (val & (mask << i)))));
1950   for (i += 16; i < 64; i += 16)
1951     {
1952       if ((val2 & (mask << i)) == 0)
1953         continue;
1954       if (generate)
1955         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1956                                    GEN_INT ((val >> i) & 0xffff)));
1957       num_insns ++;
1958     }
1959
1960   return num_insns;
1961 }
1962
1963
1964 void
1965 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1966 {
1967   machine_mode mode = GET_MODE (dest);
1968
1969   gcc_assert (mode == SImode || mode == DImode);
1970
1971   /* Check on what type of symbol it is.  */
1972   scalar_int_mode int_mode;
1973   if ((GET_CODE (imm) == SYMBOL_REF
1974        || GET_CODE (imm) == LABEL_REF
1975        || GET_CODE (imm) == CONST)
1976       && is_a <scalar_int_mode> (mode, &int_mode))
1977     {
1978       rtx mem, base, offset;
1979       enum aarch64_symbol_type sty;
1980
1981       /* If we have (const (plus symbol offset)), separate out the offset
1982          before we start classifying the symbol.  */
1983       split_const (imm, &base, &offset);
1984
1985       sty = aarch64_classify_symbol (base, offset);
1986       switch (sty)
1987         {
1988         case SYMBOL_FORCE_TO_MEM:
1989           if (offset != const0_rtx
1990               && targetm.cannot_force_const_mem (int_mode, imm))
1991             {
1992               gcc_assert (can_create_pseudo_p ());
1993               base = aarch64_force_temporary (int_mode, dest, base);
1994               base = aarch64_add_offset (int_mode, NULL, base,
1995                                          INTVAL (offset));
1996               aarch64_emit_move (dest, base);
1997               return;
1998             }
1999
2000           mem = force_const_mem (ptr_mode, imm);
2001           gcc_assert (mem);
2002
2003           /* If we aren't generating PC relative literals, then
2004              we need to expand the literal pool access carefully.
2005              This is something that needs to be done in a number
2006              of places, so could well live as a separate function.  */
2007           if (!aarch64_pcrelative_literal_loads)
2008             {
2009               gcc_assert (can_create_pseudo_p ());
2010               base = gen_reg_rtx (ptr_mode);
2011               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2012               if (ptr_mode != Pmode)
2013                 base = convert_memory_address (Pmode, base);
2014               mem = gen_rtx_MEM (ptr_mode, base);
2015             }
2016
2017           if (int_mode != ptr_mode)
2018             mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2019
2020           emit_insn (gen_rtx_SET (dest, mem));
2021
2022           return;
2023
2024         case SYMBOL_SMALL_TLSGD:
2025         case SYMBOL_SMALL_TLSDESC:
2026         case SYMBOL_SMALL_TLSIE:
2027         case SYMBOL_SMALL_GOT_28K:
2028         case SYMBOL_SMALL_GOT_4G:
2029         case SYMBOL_TINY_GOT:
2030         case SYMBOL_TINY_TLSIE:
2031           if (offset != const0_rtx)
2032             {
2033               gcc_assert(can_create_pseudo_p ());
2034               base = aarch64_force_temporary (int_mode, dest, base);
2035               base = aarch64_add_offset (int_mode, NULL, base,
2036                                          INTVAL (offset));
2037               aarch64_emit_move (dest, base);
2038               return;
2039             }
2040           /* FALLTHRU */
2041
2042         case SYMBOL_SMALL_ABSOLUTE:
2043         case SYMBOL_TINY_ABSOLUTE:
2044         case SYMBOL_TLSLE12:
2045         case SYMBOL_TLSLE24:
2046         case SYMBOL_TLSLE32:
2047         case SYMBOL_TLSLE48:
2048           aarch64_load_symref_appropriately (dest, imm, sty);
2049           return;
2050
2051         default:
2052           gcc_unreachable ();
2053         }
2054     }
2055
2056   if (!CONST_INT_P (imm))
2057     {
2058       if (GET_CODE (imm) == HIGH)
2059         emit_insn (gen_rtx_SET (dest, imm));
2060       else
2061         {
2062           rtx mem = force_const_mem (mode, imm);
2063           gcc_assert (mem);
2064           emit_insn (gen_rtx_SET (dest, mem));
2065         }
2066
2067       return;
2068     }
2069
2070   aarch64_internal_mov_immediate (dest, imm, true,
2071                                   as_a <scalar_int_mode> (mode));
2072 }
2073
2074 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2075    temporary value if necessary.  FRAME_RELATED_P should be true if
2076    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2077    to the generated instructions.  If SCRATCHREG is known to hold
2078    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2079    immediate again.
2080
2081    Since this function may be used to adjust the stack pointer, we must
2082    ensure that it cannot cause transient stack deallocation (for example
2083    by first incrementing SP and then decrementing when adjusting by a
2084    large immediate).  */
2085
2086 static void
2087 aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
2088                                int scratchreg, HOST_WIDE_INT delta,
2089                                bool frame_related_p, bool emit_move_imm)
2090 {
2091   HOST_WIDE_INT mdelta = abs_hwi (delta);
2092   rtx this_rtx = gen_rtx_REG (mode, regnum);
2093   rtx_insn *insn;
2094
2095   if (!mdelta)
2096     return;
2097
2098   /* Single instruction adjustment.  */
2099   if (aarch64_uimm12_shift (mdelta))
2100     {
2101       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2102       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2103       return;
2104     }
2105
2106   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2107      Only do this if mdelta is not a 16-bit move as adjusting using a move
2108      is better.  */
2109   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2110     {
2111       HOST_WIDE_INT low_off = mdelta & 0xfff;
2112
2113       low_off = delta < 0 ? -low_off : low_off;
2114       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2115       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2116       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2117       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2118       return;
2119     }
2120
2121   /* Emit a move immediate if required and an addition/subtraction.  */
2122   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2123   if (emit_move_imm)
2124     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2125   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2126                               : gen_add2_insn (this_rtx, scratch_rtx));
2127   if (frame_related_p)
2128     {
2129       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2130       rtx adj = plus_constant (mode, this_rtx, delta);
2131       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2132     }
2133 }
2134
2135 static inline void
2136 aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
2137                       HOST_WIDE_INT delta)
2138 {
2139   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2140 }
2141
2142 static inline void
2143 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2144 {
2145   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2146                                  true, emit_move_imm);
2147 }
2148
2149 static inline void
2150 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2151 {
2152   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2153                                  frame_related_p, true);
2154 }
2155
2156 static bool
2157 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2158                                  tree exp ATTRIBUTE_UNUSED)
2159 {
2160   /* Currently, always true.  */
2161   return true;
2162 }
2163
2164 /* Implement TARGET_PASS_BY_REFERENCE.  */
2165
2166 static bool
2167 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2168                            machine_mode mode,
2169                            const_tree type,
2170                            bool named ATTRIBUTE_UNUSED)
2171 {
2172   HOST_WIDE_INT size;
2173   machine_mode dummymode;
2174   int nregs;
2175
2176   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2177   size = (mode == BLKmode && type)
2178     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2179
2180   /* Aggregates are passed by reference based on their size.  */
2181   if (type && AGGREGATE_TYPE_P (type))
2182     {
2183       size = int_size_in_bytes (type);
2184     }
2185
2186   /* Variable sized arguments are always returned by reference.  */
2187   if (size < 0)
2188     return true;
2189
2190   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2191   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2192                                                &dummymode, &nregs,
2193                                                NULL))
2194     return false;
2195
2196   /* Arguments which are variable sized or larger than 2 registers are
2197      passed by reference unless they are a homogenous floating point
2198      aggregate.  */
2199   return size > 2 * UNITS_PER_WORD;
2200 }
2201
2202 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2203 static bool
2204 aarch64_return_in_msb (const_tree valtype)
2205 {
2206   machine_mode dummy_mode;
2207   int dummy_int;
2208
2209   /* Never happens in little-endian mode.  */
2210   if (!BYTES_BIG_ENDIAN)
2211     return false;
2212
2213   /* Only composite types smaller than or equal to 16 bytes can
2214      be potentially returned in registers.  */
2215   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2216       || int_size_in_bytes (valtype) <= 0
2217       || int_size_in_bytes (valtype) > 16)
2218     return false;
2219
2220   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2221      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2222      is always passed/returned in the least significant bits of fp/simd
2223      register(s).  */
2224   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2225                                                &dummy_mode, &dummy_int, NULL))
2226     return false;
2227
2228   return true;
2229 }
2230
2231 /* Implement TARGET_FUNCTION_VALUE.
2232    Define how to find the value returned by a function.  */
2233
2234 static rtx
2235 aarch64_function_value (const_tree type, const_tree func,
2236                         bool outgoing ATTRIBUTE_UNUSED)
2237 {
2238   machine_mode mode;
2239   int unsignedp;
2240   int count;
2241   machine_mode ag_mode;
2242
2243   mode = TYPE_MODE (type);
2244   if (INTEGRAL_TYPE_P (type))
2245     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2246
2247   if (aarch64_return_in_msb (type))
2248     {
2249       HOST_WIDE_INT size = int_size_in_bytes (type);
2250
2251       if (size % UNITS_PER_WORD != 0)
2252         {
2253           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2254           mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
2255         }
2256     }
2257
2258   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2259                                                &ag_mode, &count, NULL))
2260     {
2261       if (!aarch64_composite_type_p (type, mode))
2262         {
2263           gcc_assert (count == 1 && mode == ag_mode);
2264           return gen_rtx_REG (mode, V0_REGNUM);
2265         }
2266       else
2267         {
2268           int i;
2269           rtx par;
2270
2271           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2272           for (i = 0; i < count; i++)
2273             {
2274               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2275               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2276                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2277               XVECEXP (par, 0, i) = tmp;
2278             }
2279           return par;
2280         }
2281     }
2282   else
2283     return gen_rtx_REG (mode, R0_REGNUM);
2284 }
2285
2286 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2287    Return true if REGNO is the number of a hard register in which the values
2288    of called function may come back.  */
2289
2290 static bool
2291 aarch64_function_value_regno_p (const unsigned int regno)
2292 {
2293   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2294      of 16-byte return values are: 128-bit integers and 16-byte small
2295      structures (excluding homogeneous floating-point aggregates).  */
2296   if (regno == R0_REGNUM || regno == R1_REGNUM)
2297     return true;
2298
2299   /* Up to four fp/simd registers can return a function value, e.g. a
2300      homogeneous floating-point aggregate having four members.  */
2301   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2302     return TARGET_FLOAT;
2303
2304   return false;
2305 }
2306
2307 /* Implement TARGET_RETURN_IN_MEMORY.
2308
2309    If the type T of the result of a function is such that
2310      void func (T arg)
2311    would require that arg be passed as a value in a register (or set of
2312    registers) according to the parameter passing rules, then the result
2313    is returned in the same registers as would be used for such an
2314    argument.  */
2315
2316 static bool
2317 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2318 {
2319   HOST_WIDE_INT size;
2320   machine_mode ag_mode;
2321   int count;
2322
2323   if (!AGGREGATE_TYPE_P (type)
2324       && TREE_CODE (type) != COMPLEX_TYPE
2325       && TREE_CODE (type) != VECTOR_TYPE)
2326     /* Simple scalar types always returned in registers.  */
2327     return false;
2328
2329   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2330                                                type,
2331                                                &ag_mode,
2332                                                &count,
2333                                                NULL))
2334     return false;
2335
2336   /* Types larger than 2 registers returned in memory.  */
2337   size = int_size_in_bytes (type);
2338   return (size < 0 || size > 2 * UNITS_PER_WORD);
2339 }
2340
2341 static bool
2342 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2343                                const_tree type, int *nregs)
2344 {
2345   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2346   return aarch64_vfp_is_call_or_return_candidate (mode,
2347                                                   type,
2348                                                   &pcum->aapcs_vfp_rmode,
2349                                                   nregs,
2350                                                   NULL);
2351 }
2352
2353 /* Given MODE and TYPE of a function argument, return the alignment in
2354    bits.  The idea is to suppress any stronger alignment requested by
2355    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2356    This is a helper function for local use only.  */
2357
2358 static unsigned int
2359 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2360 {
2361   if (!type)
2362     return GET_MODE_ALIGNMENT (mode);
2363
2364   if (integer_zerop (TYPE_SIZE (type)))
2365     return 0;
2366
2367   gcc_assert (TYPE_MODE (type) == mode);
2368
2369   if (!AGGREGATE_TYPE_P (type))
2370     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2371
2372   if (TREE_CODE (type) == ARRAY_TYPE)
2373     return TYPE_ALIGN (TREE_TYPE (type));
2374
2375   unsigned int alignment = 0;
2376   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2377     if (TREE_CODE (field) == FIELD_DECL)
2378       alignment = std::max (alignment, DECL_ALIGN (field));
2379
2380   return alignment;
2381 }
2382
2383 /* Layout a function argument according to the AAPCS64 rules.  The rule
2384    numbers refer to the rule numbers in the AAPCS64.  */
2385
2386 static void
2387 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2388                     const_tree type,
2389                     bool named ATTRIBUTE_UNUSED)
2390 {
2391   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2392   int ncrn, nvrn, nregs;
2393   bool allocate_ncrn, allocate_nvrn;
2394   HOST_WIDE_INT size;
2395
2396   /* We need to do this once per argument.  */
2397   if (pcum->aapcs_arg_processed)
2398     return;
2399
2400   pcum->aapcs_arg_processed = true;
2401
2402   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2403   size
2404     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2405                 UNITS_PER_WORD);
2406
2407   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2408   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2409                                                  mode,
2410                                                  type,
2411                                                  &nregs);
2412
2413   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2414      The following code thus handles passing by SIMD/FP registers first.  */
2415
2416   nvrn = pcum->aapcs_nvrn;
2417
2418   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2419      and homogenous short-vector aggregates (HVA).  */
2420   if (allocate_nvrn)
2421     {
2422       if (!TARGET_FLOAT)
2423         aarch64_err_no_fpadvsimd (mode, "argument");
2424
2425       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2426         {
2427           pcum->aapcs_nextnvrn = nvrn + nregs;
2428           if (!aarch64_composite_type_p (type, mode))
2429             {
2430               gcc_assert (nregs == 1);
2431               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2432             }
2433           else
2434             {
2435               rtx par;
2436               int i;
2437               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2438               for (i = 0; i < nregs; i++)
2439                 {
2440                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2441                                          V0_REGNUM + nvrn + i);
2442                   tmp = gen_rtx_EXPR_LIST
2443                     (VOIDmode, tmp,
2444                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2445                   XVECEXP (par, 0, i) = tmp;
2446                 }
2447               pcum->aapcs_reg = par;
2448             }
2449           return;
2450         }
2451       else
2452         {
2453           /* C.3 NSRN is set to 8.  */
2454           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2455           goto on_stack;
2456         }
2457     }
2458
2459   ncrn = pcum->aapcs_ncrn;
2460   nregs = size / UNITS_PER_WORD;
2461
2462   /* C6 - C9.  though the sign and zero extension semantics are
2463      handled elsewhere.  This is the case where the argument fits
2464      entirely general registers.  */
2465   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2466     {
2467
2468       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2469
2470       /* C.8 if the argument has an alignment of 16 then the NGRN is
2471          rounded up to the next even number.  */
2472       if (nregs == 2
2473           && ncrn % 2
2474           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2475              comparison is there because for > 16 * BITS_PER_UNIT
2476              alignment nregs should be > 2 and therefore it should be
2477              passed by reference rather than value.  */
2478           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2479         {
2480           ++ncrn;
2481           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2482         }
2483
2484       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2485          A reg is still generated for it, but the caller should be smart
2486          enough not to use it.  */
2487       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2488         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2489       else
2490         {
2491           rtx par;
2492           int i;
2493
2494           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2495           for (i = 0; i < nregs; i++)
2496             {
2497               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2498               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2499                                        GEN_INT (i * UNITS_PER_WORD));
2500               XVECEXP (par, 0, i) = tmp;
2501             }
2502           pcum->aapcs_reg = par;
2503         }
2504
2505       pcum->aapcs_nextncrn = ncrn + nregs;
2506       return;
2507     }
2508
2509   /* C.11  */
2510   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2511
2512   /* The argument is passed on stack; record the needed number of words for
2513      this argument and align the total size if necessary.  */
2514 on_stack:
2515   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2516
2517   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2518     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2519                                        16 / UNITS_PER_WORD);
2520   return;
2521 }
2522
2523 /* Implement TARGET_FUNCTION_ARG.  */
2524
2525 static rtx
2526 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2527                       const_tree type, bool named)
2528 {
2529   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2530   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2531
2532   if (mode == VOIDmode)
2533     return NULL_RTX;
2534
2535   aarch64_layout_arg (pcum_v, mode, type, named);
2536   return pcum->aapcs_reg;
2537 }
2538
2539 void
2540 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2541                            const_tree fntype ATTRIBUTE_UNUSED,
2542                            rtx libname ATTRIBUTE_UNUSED,
2543                            const_tree fndecl ATTRIBUTE_UNUSED,
2544                            unsigned n_named ATTRIBUTE_UNUSED)
2545 {
2546   pcum->aapcs_ncrn = 0;
2547   pcum->aapcs_nvrn = 0;
2548   pcum->aapcs_nextncrn = 0;
2549   pcum->aapcs_nextnvrn = 0;
2550   pcum->pcs_variant = ARM_PCS_AAPCS64;
2551   pcum->aapcs_reg = NULL_RTX;
2552   pcum->aapcs_arg_processed = false;
2553   pcum->aapcs_stack_words = 0;
2554   pcum->aapcs_stack_size = 0;
2555
2556   if (!TARGET_FLOAT
2557       && fndecl && TREE_PUBLIC (fndecl)
2558       && fntype && fntype != error_mark_node)
2559     {
2560       const_tree type = TREE_TYPE (fntype);
2561       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2562       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2563       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2564                                                    &mode, &nregs, NULL))
2565         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2566     }
2567   return;
2568 }
2569
2570 static void
2571 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2572                               machine_mode mode,
2573                               const_tree type,
2574                               bool named)
2575 {
2576   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2577   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2578     {
2579       aarch64_layout_arg (pcum_v, mode, type, named);
2580       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2581                   != (pcum->aapcs_stack_words != 0));
2582       pcum->aapcs_arg_processed = false;
2583       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2584       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2585       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2586       pcum->aapcs_stack_words = 0;
2587       pcum->aapcs_reg = NULL_RTX;
2588     }
2589 }
2590
2591 bool
2592 aarch64_function_arg_regno_p (unsigned regno)
2593 {
2594   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2595           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2596 }
2597
2598 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2599    PARM_BOUNDARY bits of alignment, but will be given anything up
2600    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2601    that both before and after the layout of each argument, the Next
2602    Stacked Argument Address (NSAA) will have a minimum alignment of
2603    8 bytes.  */
2604
2605 static unsigned int
2606 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2607 {
2608   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2609   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2610 }
2611
2612 /* Implement TARGET_FUNCTION_ARG_PADDING.
2613
2614    Small aggregate types are placed in the lowest memory address.
2615
2616    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2617
2618 static pad_direction
2619 aarch64_function_arg_padding (machine_mode mode, const_tree type)
2620 {
2621   /* On little-endian targets, the least significant byte of every stack
2622      argument is passed at the lowest byte address of the stack slot.  */
2623   if (!BYTES_BIG_ENDIAN)
2624     return PAD_UPWARD;
2625
2626   /* Otherwise, integral, floating-point and pointer types are padded downward:
2627      the least significant byte of a stack argument is passed at the highest
2628      byte address of the stack slot.  */
2629   if (type
2630       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2631          || POINTER_TYPE_P (type))
2632       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2633     return PAD_DOWNWARD;
2634
2635   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2636   return PAD_UPWARD;
2637 }
2638
2639 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2640
2641    It specifies padding for the last (may also be the only)
2642    element of a block move between registers and memory.  If
2643    assuming the block is in the memory, padding upward means that
2644    the last element is padded after its highest significant byte,
2645    while in downward padding, the last element is padded at the
2646    its least significant byte side.
2647
2648    Small aggregates and small complex types are always padded
2649    upwards.
2650
2651    We don't need to worry about homogeneous floating-point or
2652    short-vector aggregates; their move is not affected by the
2653    padding direction determined here.  Regardless of endianness,
2654    each element of such an aggregate is put in the least
2655    significant bits of a fp/simd register.
2656
2657    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2658    register has useful data, and return the opposite if the most
2659    significant byte does.  */
2660
2661 bool
2662 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2663                      bool first ATTRIBUTE_UNUSED)
2664 {
2665
2666   /* Small composite types are always padded upward.  */
2667   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2668     {
2669       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2670                             : GET_MODE_SIZE (mode));
2671       if (size < 2 * UNITS_PER_WORD)
2672         return true;
2673     }
2674
2675   /* Otherwise, use the default padding.  */
2676   return !BYTES_BIG_ENDIAN;
2677 }
2678
2679 static scalar_int_mode
2680 aarch64_libgcc_cmp_return_mode (void)
2681 {
2682   return SImode;
2683 }
2684
2685 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2686
2687 /* We use the 12-bit shifted immediate arithmetic instructions so values
2688    must be multiple of (1 << 12), i.e. 4096.  */
2689 #define ARITH_FACTOR 4096
2690
2691 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2692 #error Cannot use simple address calculation for stack probing
2693 #endif
2694
2695 /* The pair of scratch registers used for stack probing.  */
2696 #define PROBE_STACK_FIRST_REG  9
2697 #define PROBE_STACK_SECOND_REG 10
2698
2699 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2700    inclusive.  These are offsets from the current stack pointer.  */
2701
2702 static void
2703 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2704 {
2705   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2706
2707   /* See the same assertion on PROBE_INTERVAL above.  */
2708   gcc_assert ((first % ARITH_FACTOR) == 0);
2709
2710   /* See if we have a constant small number of probes to generate.  If so,
2711      that's the easy case.  */
2712   if (size <= PROBE_INTERVAL)
2713     {
2714       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2715
2716       emit_set_insn (reg1,
2717                      plus_constant (Pmode,
2718                                     stack_pointer_rtx, -(first + base)));
2719       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2720     }
2721
2722   /* The run-time loop is made up of 8 insns in the generic case while the
2723      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2724   else if (size <= 4 * PROBE_INTERVAL)
2725     {
2726       HOST_WIDE_INT i, rem;
2727
2728       emit_set_insn (reg1,
2729                      plus_constant (Pmode,
2730                                     stack_pointer_rtx,
2731                                     -(first + PROBE_INTERVAL)));
2732       emit_stack_probe (reg1);
2733
2734       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2735          it exceeds SIZE.  If only two probes are needed, this will not
2736          generate any code.  Then probe at FIRST + SIZE.  */
2737       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2738         {
2739           emit_set_insn (reg1,
2740                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2741           emit_stack_probe (reg1);
2742         }
2743
2744       rem = size - (i - PROBE_INTERVAL);
2745       if (rem > 256)
2746         {
2747           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2748
2749           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2750           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2751         }
2752       else
2753         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2754     }
2755
2756   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2757      extra careful with variables wrapping around because we might be at
2758      the very top (or the very bottom) of the address space and we have
2759      to be able to handle this case properly; in particular, we use an
2760      equality test for the loop condition.  */
2761   else
2762     {
2763       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2764
2765       /* Step 1: round SIZE to the previous multiple of the interval.  */
2766
2767       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2768
2769
2770       /* Step 2: compute initial and final value of the loop counter.  */
2771
2772       /* TEST_ADDR = SP + FIRST.  */
2773       emit_set_insn (reg1,
2774                      plus_constant (Pmode, stack_pointer_rtx, -first));
2775
2776       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2777       HOST_WIDE_INT adjustment = - (first + rounded_size);
2778       if (! aarch64_uimm12_shift (adjustment))
2779         {
2780           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2781                                           true, Pmode);
2782           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2783         }
2784       else
2785         {
2786           emit_set_insn (reg2,
2787                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2788         }
2789
2790       /* Step 3: the loop
2791
2792          do
2793            {
2794              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2795              probe at TEST_ADDR
2796            }
2797          while (TEST_ADDR != LAST_ADDR)
2798
2799          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2800          until it is equal to ROUNDED_SIZE.  */
2801
2802       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2803
2804
2805       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2806          that SIZE is equal to ROUNDED_SIZE.  */
2807
2808       if (size != rounded_size)
2809         {
2810           HOST_WIDE_INT rem = size - rounded_size;
2811
2812           if (rem > 256)
2813             {
2814               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2815
2816               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2817               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2818             }
2819           else
2820             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2821         }
2822     }
2823
2824   /* Make sure nothing is scheduled before we are done.  */
2825   emit_insn (gen_blockage ());
2826 }
2827
2828 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2829    absolute addresses.  */
2830
2831 const char *
2832 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2833 {
2834   static int labelno = 0;
2835   char loop_lab[32];
2836   rtx xops[2];
2837
2838   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2839
2840   /* Loop.  */
2841   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2842
2843   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2844   xops[0] = reg1;
2845   xops[1] = GEN_INT (PROBE_INTERVAL);
2846   output_asm_insn ("sub\t%0, %0, %1", xops);
2847
2848   /* Probe at TEST_ADDR.  */
2849   output_asm_insn ("str\txzr, [%0]", xops);
2850
2851   /* Test if TEST_ADDR == LAST_ADDR.  */
2852   xops[1] = reg2;
2853   output_asm_insn ("cmp\t%0, %1", xops);
2854
2855   /* Branch.  */
2856   fputs ("\tb.ne\t", asm_out_file);
2857   assemble_name_raw (asm_out_file, loop_lab);
2858   fputc ('\n', asm_out_file);
2859
2860   return "";
2861 }
2862
2863 static bool
2864 aarch64_frame_pointer_required (void)
2865 {
2866   /* In aarch64_override_options_after_change
2867      flag_omit_leaf_frame_pointer turns off the frame pointer by
2868      default.  Turn it back on now if we've not got a leaf
2869      function.  */
2870   if (flag_omit_leaf_frame_pointer
2871       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2872     return true;
2873
2874   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2875   if (crtl->calls_eh_return)
2876     return true;
2877
2878   return false;
2879 }
2880
2881 /* Mark the registers that need to be saved by the callee and calculate
2882    the size of the callee-saved registers area and frame record (both FP
2883    and LR may be omitted).  */
2884 static void
2885 aarch64_layout_frame (void)
2886 {
2887   HOST_WIDE_INT offset = 0;
2888   int regno, last_fp_reg = INVALID_REGNUM;
2889
2890   if (reload_completed && cfun->machine->frame.laid_out)
2891     return;
2892
2893 #define SLOT_NOT_REQUIRED (-2)
2894 #define SLOT_REQUIRED     (-1)
2895
2896   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2897   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2898
2899   /* First mark all the registers that really need to be saved...  */
2900   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2901     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2902
2903   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2904     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2905
2906   /* ... that includes the eh data registers (if needed)...  */
2907   if (crtl->calls_eh_return)
2908     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2909       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2910         = SLOT_REQUIRED;
2911
2912   /* ... and any callee saved register that dataflow says is live.  */
2913   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2914     if (df_regs_ever_live_p (regno)
2915         && (regno == R30_REGNUM
2916             || !call_used_regs[regno]))
2917       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2918
2919   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2920     if (df_regs_ever_live_p (regno)
2921         && !call_used_regs[regno])
2922       {
2923         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2924         last_fp_reg = regno;
2925       }
2926
2927   if (frame_pointer_needed)
2928     {
2929       /* FP and LR are placed in the linkage record.  */
2930       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2931       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2932       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2933       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2934       offset += 2 * UNITS_PER_WORD;
2935     }
2936
2937   /* Now assign stack slots for them.  */
2938   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2939     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2940       {
2941         cfun->machine->frame.reg_offset[regno] = offset;
2942         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2943           cfun->machine->frame.wb_candidate1 = regno;
2944         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2945           cfun->machine->frame.wb_candidate2 = regno;
2946         offset += UNITS_PER_WORD;
2947       }
2948
2949   HOST_WIDE_INT max_int_offset = offset;
2950   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951   bool has_align_gap = offset != max_int_offset;
2952
2953   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2954     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2955       {
2956         /* If there is an alignment gap between integer and fp callee-saves,
2957            allocate the last fp register to it if possible.  */
2958         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2959           {
2960             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2961             break;
2962           }
2963
2964         cfun->machine->frame.reg_offset[regno] = offset;
2965         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2966           cfun->machine->frame.wb_candidate1 = regno;
2967         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2968                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2969           cfun->machine->frame.wb_candidate2 = regno;
2970         offset += UNITS_PER_WORD;
2971       }
2972
2973   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2974
2975   cfun->machine->frame.saved_regs_size = offset;
2976
2977   HOST_WIDE_INT varargs_and_saved_regs_size
2978     = offset + cfun->machine->frame.saved_varargs_size;
2979
2980   cfun->machine->frame.hard_fp_offset
2981     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2982                 STACK_BOUNDARY / BITS_PER_UNIT);
2983
2984   cfun->machine->frame.frame_size
2985     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2986                 + crtl->outgoing_args_size,
2987                 STACK_BOUNDARY / BITS_PER_UNIT);
2988
2989   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2990
2991   cfun->machine->frame.initial_adjust = 0;
2992   cfun->machine->frame.final_adjust = 0;
2993   cfun->machine->frame.callee_adjust = 0;
2994   cfun->machine->frame.callee_offset = 0;
2995
2996   HOST_WIDE_INT max_push_offset = 0;
2997   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2998     max_push_offset = 512;
2999   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3000     max_push_offset = 256;
3001
3002   if (cfun->machine->frame.frame_size < max_push_offset
3003       && crtl->outgoing_args_size == 0)
3004     {
3005       /* Simple, small frame with no outgoing arguments:
3006          stp reg1, reg2, [sp, -frame_size]!
3007          stp reg3, reg4, [sp, 16]  */
3008       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
3009     }
3010   else if ((crtl->outgoing_args_size
3011             + cfun->machine->frame.saved_regs_size < 512)
3012            && !(cfun->calls_alloca
3013                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
3014     {
3015       /* Frame with small outgoing arguments:
3016          sub sp, sp, frame_size
3017          stp reg1, reg2, [sp, outgoing_args_size]
3018          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
3019       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3020       cfun->machine->frame.callee_offset
3021         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3022     }
3023   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3024     {
3025       /* Frame with large outgoing arguments but a small local area:
3026          stp reg1, reg2, [sp, -hard_fp_offset]!
3027          stp reg3, reg4, [sp, 16]
3028          sub sp, sp, outgoing_args_size  */
3029       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3030       cfun->machine->frame.final_adjust
3031         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3032     }
3033   else if (!frame_pointer_needed
3034            && varargs_and_saved_regs_size < max_push_offset)
3035     {
3036       /* Frame with large local area and outgoing arguments (this pushes the
3037          callee-saves first, followed by the locals and outgoing area):
3038          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3039          stp reg3, reg4, [sp, 16]
3040          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
3041       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3042       cfun->machine->frame.final_adjust
3043         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3044       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3045       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3046     }
3047   else
3048     {
3049       /* Frame with large local area and outgoing arguments using frame pointer:
3050          sub sp, sp, hard_fp_offset
3051          stp x29, x30, [sp, 0]
3052          add x29, sp, 0
3053          stp reg3, reg4, [sp, 16]
3054          sub sp, sp, outgoing_args_size  */
3055       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3056       cfun->machine->frame.final_adjust
3057         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3058     }
3059
3060   cfun->machine->frame.laid_out = true;
3061 }
3062
3063 /* Return true if the register REGNO is saved on entry to
3064    the current function.  */
3065
3066 static bool
3067 aarch64_register_saved_on_entry (int regno)
3068 {
3069   return cfun->machine->frame.reg_offset[regno] >= 0;
3070 }
3071
3072 /* Return the next register up from REGNO up to LIMIT for the callee
3073    to save.  */
3074
3075 static unsigned
3076 aarch64_next_callee_save (unsigned regno, unsigned limit)
3077 {
3078   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3079     regno ++;
3080   return regno;
3081 }
3082
3083 /* Push the register number REGNO of mode MODE to the stack with write-back
3084    adjusting the stack by ADJUSTMENT.  */
3085
3086 static void
3087 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3088                            HOST_WIDE_INT adjustment)
3089  {
3090   rtx base_rtx = stack_pointer_rtx;
3091   rtx insn, reg, mem;
3092
3093   reg = gen_rtx_REG (mode, regno);
3094   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3095                             plus_constant (Pmode, base_rtx, -adjustment));
3096   mem = gen_frame_mem (mode, mem);
3097
3098   insn = emit_move_insn (mem, reg);
3099   RTX_FRAME_RELATED_P (insn) = 1;
3100 }
3101
3102 /* Generate and return an instruction to store the pair of registers
3103    REG and REG2 of mode MODE to location BASE with write-back adjusting
3104    the stack location BASE by ADJUSTMENT.  */
3105
3106 static rtx
3107 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3108                           HOST_WIDE_INT adjustment)
3109 {
3110   switch (mode)
3111     {
3112     case E_DImode:
3113       return gen_storewb_pairdi_di (base, base, reg, reg2,
3114                                     GEN_INT (-adjustment),
3115                                     GEN_INT (UNITS_PER_WORD - adjustment));
3116     case E_DFmode:
3117       return gen_storewb_pairdf_di (base, base, reg, reg2,
3118                                     GEN_INT (-adjustment),
3119                                     GEN_INT (UNITS_PER_WORD - adjustment));
3120     default:
3121       gcc_unreachable ();
3122     }
3123 }
3124
3125 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3126    stack pointer by ADJUSTMENT.  */
3127
3128 static void
3129 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3130 {
3131   rtx_insn *insn;
3132   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3133
3134   if (regno2 == INVALID_REGNUM)
3135     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3136
3137   rtx reg1 = gen_rtx_REG (mode, regno1);
3138   rtx reg2 = gen_rtx_REG (mode, regno2);
3139
3140   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3141                                               reg2, adjustment));
3142   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3143   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3144   RTX_FRAME_RELATED_P (insn) = 1;
3145 }
3146
3147 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3148    adjusting it by ADJUSTMENT afterwards.  */
3149
3150 static rtx
3151 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3152                          HOST_WIDE_INT adjustment)
3153 {
3154   switch (mode)
3155     {
3156     case E_DImode:
3157       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3158                                    GEN_INT (UNITS_PER_WORD));
3159     case E_DFmode:
3160       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3161                                    GEN_INT (UNITS_PER_WORD));
3162     default:
3163       gcc_unreachable ();
3164     }
3165 }
3166
3167 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3168    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3169    into CFI_OPS.  */
3170
3171 static void
3172 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3173                   rtx *cfi_ops)
3174 {
3175   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3176   rtx reg1 = gen_rtx_REG (mode, regno1);
3177
3178   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3179
3180   if (regno2 == INVALID_REGNUM)
3181     {
3182       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3183       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3184       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3185     }
3186   else
3187     {
3188       rtx reg2 = gen_rtx_REG (mode, regno2);
3189       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3190       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3191                                           reg2, adjustment));
3192     }
3193 }
3194
3195 /* Generate and return a store pair instruction of mode MODE to store
3196    register REG1 to MEM1 and register REG2 to MEM2.  */
3197
3198 static rtx
3199 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3200                         rtx reg2)
3201 {
3202   switch (mode)
3203     {
3204     case E_DImode:
3205       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3206
3207     case E_DFmode:
3208       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3209
3210     default:
3211       gcc_unreachable ();
3212     }
3213 }
3214
3215 /* Generate and regurn a load pair isntruction of mode MODE to load register
3216    REG1 from MEM1 and register REG2 from MEM2.  */
3217
3218 static rtx
3219 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3220                        rtx mem2)
3221 {
3222   switch (mode)
3223     {
3224     case E_DImode:
3225       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3226
3227     case E_DFmode:
3228       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3229
3230     default:
3231       gcc_unreachable ();
3232     }
3233 }
3234
3235 /* Return TRUE if return address signing should be enabled for the current
3236    function, otherwise return FALSE.  */
3237
3238 bool
3239 aarch64_return_address_signing_enabled (void)
3240 {
3241   /* This function should only be called after frame laid out.   */
3242   gcc_assert (cfun->machine->frame.laid_out);
3243
3244   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3245      if it's LR is pushed onto stack.  */
3246   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3247           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3248               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3249 }
3250
3251 /* Emit code to save the callee-saved registers from register number START
3252    to LIMIT to the stack at the location starting at offset START_OFFSET,
3253    skipping any write-back candidates if SKIP_WB is true.  */
3254
3255 static void
3256 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3257                            unsigned start, unsigned limit, bool skip_wb)
3258 {
3259   rtx_insn *insn;
3260   unsigned regno;
3261   unsigned regno2;
3262
3263   for (regno = aarch64_next_callee_save (start, limit);
3264        regno <= limit;
3265        regno = aarch64_next_callee_save (regno + 1, limit))
3266     {
3267       rtx reg, mem;
3268       HOST_WIDE_INT offset;
3269
3270       if (skip_wb
3271           && (regno == cfun->machine->frame.wb_candidate1
3272               || regno == cfun->machine->frame.wb_candidate2))
3273         continue;
3274
3275       if (cfun->machine->reg_is_wrapped_separately[regno])
3276        continue;
3277
3278       reg = gen_rtx_REG (mode, regno);
3279       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3280       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3281                                                 offset));
3282
3283       regno2 = aarch64_next_callee_save (regno + 1, limit);
3284
3285       if (regno2 <= limit
3286           && !cfun->machine->reg_is_wrapped_separately[regno2]
3287           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3288               == cfun->machine->frame.reg_offset[regno2]))
3289
3290         {
3291           rtx reg2 = gen_rtx_REG (mode, regno2);
3292           rtx mem2;
3293
3294           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3295           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3296                                                      offset));
3297           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3298                                                     reg2));
3299
3300           /* The first part of a frame-related parallel insn is
3301              always assumed to be relevant to the frame
3302              calculations; subsequent parts, are only
3303              frame-related if explicitly marked.  */
3304           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3305           regno = regno2;
3306         }
3307       else
3308         insn = emit_move_insn (mem, reg);
3309
3310       RTX_FRAME_RELATED_P (insn) = 1;
3311     }
3312 }
3313
3314 /* Emit code to restore the callee registers of mode MODE from register
3315    number START up to and including LIMIT.  Restore from the stack offset
3316    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3317    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3318
3319 static void
3320 aarch64_restore_callee_saves (machine_mode mode,
3321                               HOST_WIDE_INT start_offset, unsigned start,
3322                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3323 {
3324   rtx base_rtx = stack_pointer_rtx;
3325   unsigned regno;
3326   unsigned regno2;
3327   HOST_WIDE_INT offset;
3328
3329   for (regno = aarch64_next_callee_save (start, limit);
3330        regno <= limit;
3331        regno = aarch64_next_callee_save (regno + 1, limit))
3332     {
3333       if (cfun->machine->reg_is_wrapped_separately[regno])
3334        continue;
3335
3336       rtx reg, mem;
3337
3338       if (skip_wb
3339           && (regno == cfun->machine->frame.wb_candidate1
3340               || regno == cfun->machine->frame.wb_candidate2))
3341         continue;
3342
3343       reg = gen_rtx_REG (mode, regno);
3344       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3345       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3346
3347       regno2 = aarch64_next_callee_save (regno + 1, limit);
3348
3349       if (regno2 <= limit
3350           && !cfun->machine->reg_is_wrapped_separately[regno2]
3351           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3352               == cfun->machine->frame.reg_offset[regno2]))
3353         {
3354           rtx reg2 = gen_rtx_REG (mode, regno2);
3355           rtx mem2;
3356
3357           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3358           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3359           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3360
3361           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3362           regno = regno2;
3363         }
3364       else
3365         emit_move_insn (reg, mem);
3366       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3367     }
3368 }
3369
3370 static inline bool
3371 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3372                                HOST_WIDE_INT offset)
3373 {
3374   return offset >= -256 && offset < 256;
3375 }
3376
3377 static inline bool
3378 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3379 {
3380   return (offset >= 0
3381           && offset < 4096 * GET_MODE_SIZE (mode)
3382           && offset % GET_MODE_SIZE (mode) == 0);
3383 }
3384
3385 bool
3386 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3387 {
3388   return (offset >= -64 * GET_MODE_SIZE (mode)
3389           && offset < 64 * GET_MODE_SIZE (mode)
3390           && offset % GET_MODE_SIZE (mode) == 0);
3391 }
3392
3393 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3394
3395 static sbitmap
3396 aarch64_get_separate_components (void)
3397 {
3398   aarch64_layout_frame ();
3399
3400   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3401   bitmap_clear (components);
3402
3403   /* The registers we need saved to the frame.  */
3404   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3405     if (aarch64_register_saved_on_entry (regno))
3406       {
3407         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3408         if (!frame_pointer_needed)
3409           offset += cfun->machine->frame.frame_size
3410                     - cfun->machine->frame.hard_fp_offset;
3411         /* Check that we can access the stack slot of the register with one
3412            direct load with no adjustments needed.  */
3413         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3414           bitmap_set_bit (components, regno);
3415       }
3416
3417   /* Don't mess with the hard frame pointer.  */
3418   if (frame_pointer_needed)
3419     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3420
3421   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3422   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3423   /* If aarch64_layout_frame has chosen registers to store/restore with
3424      writeback don't interfere with them to avoid having to output explicit
3425      stack adjustment instructions.  */
3426   if (reg2 != INVALID_REGNUM)
3427     bitmap_clear_bit (components, reg2);
3428   if (reg1 != INVALID_REGNUM)
3429     bitmap_clear_bit (components, reg1);
3430
3431   bitmap_clear_bit (components, LR_REGNUM);
3432   bitmap_clear_bit (components, SP_REGNUM);
3433
3434   return components;
3435 }
3436
3437 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3438
3439 static sbitmap
3440 aarch64_components_for_bb (basic_block bb)
3441 {
3442   bitmap in = DF_LIVE_IN (bb);
3443   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3444   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3445
3446   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3447   bitmap_clear (components);
3448
3449   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3450   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3451     if ((!call_used_regs[regno])
3452        && (bitmap_bit_p (in, regno)
3453            || bitmap_bit_p (gen, regno)
3454            || bitmap_bit_p (kill, regno)))
3455           bitmap_set_bit (components, regno);
3456
3457   return components;
3458 }
3459
3460 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3461    Nothing to do for aarch64.  */
3462
3463 static void
3464 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3465 {
3466 }
3467
3468 /* Return the next set bit in BMP from START onwards.  Return the total number
3469    of bits in BMP if no set bit is found at or after START.  */
3470
3471 static unsigned int
3472 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3473 {
3474   unsigned int nbits = SBITMAP_SIZE (bmp);
3475   if (start == nbits)
3476     return start;
3477
3478   gcc_assert (start < nbits);
3479   for (unsigned int i = start; i < nbits; i++)
3480     if (bitmap_bit_p (bmp, i))
3481       return i;
3482
3483   return nbits;
3484 }
3485
3486 /* Do the work for aarch64_emit_prologue_components and
3487    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3488    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3489    for these components or the epilogue sequence.  That is, it determines
3490    whether we should emit stores or loads and what kind of CFA notes to attach
3491    to the insns.  Otherwise the logic for the two sequences is very
3492    similar.  */
3493
3494 static void
3495 aarch64_process_components (sbitmap components, bool prologue_p)
3496 {
3497   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3498                              ? HARD_FRAME_POINTER_REGNUM
3499                              : STACK_POINTER_REGNUM);
3500
3501   unsigned last_regno = SBITMAP_SIZE (components);
3502   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3503   rtx_insn *insn = NULL;
3504
3505   while (regno != last_regno)
3506     {
3507       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3508          so DFmode for the vector registers is enough.  */
3509       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3510       rtx reg = gen_rtx_REG (mode, regno);
3511       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3512       if (!frame_pointer_needed)
3513         offset += cfun->machine->frame.frame_size
3514                   - cfun->machine->frame.hard_fp_offset;
3515       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3516       rtx mem = gen_frame_mem (mode, addr);
3517
3518       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3519       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3520       /* No more registers to handle after REGNO.
3521          Emit a single save/restore and exit.  */
3522       if (regno2 == last_regno)
3523         {
3524           insn = emit_insn (set);
3525           RTX_FRAME_RELATED_P (insn) = 1;
3526           if (prologue_p)
3527             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3528           else
3529             add_reg_note (insn, REG_CFA_RESTORE, reg);
3530           break;
3531         }
3532
3533       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3534       /* The next register is not of the same class or its offset is not
3535          mergeable with the current one into a pair.  */
3536       if (!satisfies_constraint_Ump (mem)
3537           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3538           || (offset2 - cfun->machine->frame.reg_offset[regno])
3539                 != GET_MODE_SIZE (mode))
3540         {
3541           insn = emit_insn (set);
3542           RTX_FRAME_RELATED_P (insn) = 1;
3543           if (prologue_p)
3544             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3545           else
3546             add_reg_note (insn, REG_CFA_RESTORE, reg);
3547
3548           regno = regno2;
3549           continue;
3550         }
3551
3552       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3553       rtx reg2 = gen_rtx_REG (mode, regno2);
3554       if (!frame_pointer_needed)
3555         offset2 += cfun->machine->frame.frame_size
3556                   - cfun->machine->frame.hard_fp_offset;
3557       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3558       rtx mem2 = gen_frame_mem (mode, addr2);
3559       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3560                              : gen_rtx_SET (reg2, mem2);
3561
3562       if (prologue_p)
3563         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3564       else
3565         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3566
3567       RTX_FRAME_RELATED_P (insn) = 1;
3568       if (prologue_p)
3569         {
3570           add_reg_note (insn, REG_CFA_OFFSET, set);
3571           add_reg_note (insn, REG_CFA_OFFSET, set2);
3572         }
3573       else
3574         {
3575           add_reg_note (insn, REG_CFA_RESTORE, reg);
3576           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3577         }
3578
3579       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3580     }
3581 }
3582
3583 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3584
3585 static void
3586 aarch64_emit_prologue_components (sbitmap components)
3587 {
3588   aarch64_process_components (components, true);
3589 }
3590
3591 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3592
3593 static void
3594 aarch64_emit_epilogue_components (sbitmap components)
3595 {
3596   aarch64_process_components (components, false);
3597 }
3598
3599 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3600
3601 static void
3602 aarch64_set_handled_components (sbitmap components)
3603 {
3604   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3605     if (bitmap_bit_p (components, regno))
3606       cfun->machine->reg_is_wrapped_separately[regno] = true;
3607 }
3608
3609 /* AArch64 stack frames generated by this compiler look like:
3610
3611         +-------------------------------+
3612         |                               |
3613         |  incoming stack arguments     |
3614         |                               |
3615         +-------------------------------+
3616         |                               | <-- incoming stack pointer (aligned)
3617         |  callee-allocated save area   |
3618         |  for register varargs         |
3619         |                               |
3620         +-------------------------------+
3621         |  local variables              | <-- frame_pointer_rtx
3622         |                               |
3623         +-------------------------------+
3624         |  padding0                     | \
3625         +-------------------------------+  |
3626         |  callee-saved registers       |  | frame.saved_regs_size
3627         +-------------------------------+  |
3628         |  LR'                          |  |
3629         +-------------------------------+  |
3630         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3631         +-------------------------------+
3632         |  dynamic allocation           |
3633         +-------------------------------+
3634         |  padding                      |
3635         +-------------------------------+
3636         |  outgoing stack arguments     | <-- arg_pointer
3637         |                               |
3638         +-------------------------------+
3639         |                               | <-- stack_pointer_rtx (aligned)
3640
3641    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3642    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3643    unchanged.  */
3644
3645 /* Generate the prologue instructions for entry into a function.
3646    Establish the stack frame by decreasing the stack pointer with a
3647    properly calculated size and, if necessary, create a frame record
3648    filled with the values of LR and previous frame pointer.  The
3649    current FP is also set up if it is in use.  */
3650
3651 void
3652 aarch64_expand_prologue (void)
3653 {
3654   aarch64_layout_frame ();
3655
3656   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3657   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3658   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3659   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3660   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3661   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3662   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3663   rtx_insn *insn;
3664
3665   /* Sign return address for functions.  */
3666   if (aarch64_return_address_signing_enabled ())
3667     {
3668       insn = emit_insn (gen_pacisp ());
3669       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3670       RTX_FRAME_RELATED_P (insn) = 1;
3671     }
3672
3673   if (flag_stack_usage_info)
3674     current_function_static_stack_size = frame_size;
3675
3676   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3677     {
3678       if (crtl->is_leaf && !cfun->calls_alloca)
3679         {
3680           if (frame_size > PROBE_INTERVAL
3681               && frame_size > get_stack_check_protect ())
3682             aarch64_emit_probe_stack_range (get_stack_check_protect (),
3683                                             (frame_size
3684                                              - get_stack_check_protect ()));
3685         }
3686       else if (frame_size > 0)
3687         aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
3688     }
3689
3690   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3691
3692   if (callee_adjust != 0)
3693     aarch64_push_regs (reg1, reg2, callee_adjust);
3694
3695   if (frame_pointer_needed)
3696     {
3697       if (callee_adjust == 0)
3698         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3699                                    R30_REGNUM, false);
3700       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3701                                        stack_pointer_rtx,
3702                                        GEN_INT (callee_offset)));
3703       RTX_FRAME_RELATED_P (insn) = 1;
3704       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3705     }
3706
3707   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3708                              callee_adjust != 0 || frame_pointer_needed);
3709   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3710                              callee_adjust != 0 || frame_pointer_needed);
3711   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3712 }
3713
3714 /* Return TRUE if we can use a simple_return insn.
3715
3716    This function checks whether the callee saved stack is empty, which
3717    means no restore actions are need. The pro_and_epilogue will use
3718    this to check whether shrink-wrapping opt is feasible.  */
3719
3720 bool
3721 aarch64_use_return_insn_p (void)
3722 {
3723   if (!reload_completed)
3724     return false;
3725
3726   if (crtl->profile)
3727     return false;
3728
3729   aarch64_layout_frame ();
3730
3731   return cfun->machine->frame.frame_size == 0;
3732 }
3733
3734 /* Generate the epilogue instructions for returning from a function.
3735    This is almost exactly the reverse of the prolog sequence, except
3736    that we need to insert barriers to avoid scheduling loads that read
3737    from a deallocated stack, and we optimize the unwind records by
3738    emitting them all together if possible.  */
3739 void
3740 aarch64_expand_epilogue (bool for_sibcall)
3741 {
3742   aarch64_layout_frame ();
3743
3744   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3745   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3746   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3747   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3748   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3749   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3750   rtx cfi_ops = NULL;
3751   rtx_insn *insn;
3752
3753   /* We need to add memory barrier to prevent read from deallocated stack.  */
3754   bool need_barrier_p = (get_frame_size ()
3755                          + cfun->machine->frame.saved_varargs_size) != 0;
3756
3757   /* Emit a barrier to prevent loads from a deallocated stack.  */
3758   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3759       || crtl->calls_eh_return)
3760     {
3761       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3762       need_barrier_p = false;
3763     }
3764
3765   /* Restore the stack pointer from the frame pointer if it may not
3766      be the same as the stack pointer.  */
3767   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3768     {
3769       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3770                                        hard_frame_pointer_rtx,
3771                                        GEN_INT (-callee_offset)));
3772       /* If writeback is used when restoring callee-saves, the CFA
3773          is restored on the instruction doing the writeback.  */
3774       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3775     }
3776   else
3777     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3778
3779   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3780                                 callee_adjust != 0, &cfi_ops);
3781   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3782                                 callee_adjust != 0, &cfi_ops);
3783
3784   if (need_barrier_p)
3785     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3786
3787   if (callee_adjust != 0)
3788     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3789
3790   if (callee_adjust != 0 || initial_adjust > 65536)
3791     {
3792       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3793       insn = get_last_insn ();
3794       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3795       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3796       RTX_FRAME_RELATED_P (insn) = 1;
3797       cfi_ops = NULL;
3798     }
3799
3800   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3801
3802   if (cfi_ops)
3803     {
3804       /* Emit delayed restores and reset the CFA to be SP.  */
3805       insn = get_last_insn ();
3806       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3807       REG_NOTES (insn) = cfi_ops;
3808       RTX_FRAME_RELATED_P (insn) = 1;
3809     }
3810
3811   /* We prefer to emit the combined return/authenticate instruction RETAA,
3812      however there are three cases in which we must instead emit an explicit
3813      authentication instruction.
3814
3815         1) Sibcalls don't return in a normal way, so if we're about to call one
3816            we must authenticate.
3817
3818         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3819            generating code for !TARGET_ARMV8_3 we can't use it and must
3820            explicitly authenticate.
3821
3822         3) On an eh_return path we make extra stack adjustments to update the
3823            canonical frame address to be the exception handler's CFA.  We want
3824            to authenticate using the CFA of the function which calls eh_return.
3825     */
3826   if (aarch64_return_address_signing_enabled ()
3827       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3828     {
3829       insn = emit_insn (gen_autisp ());
3830       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3831       RTX_FRAME_RELATED_P (insn) = 1;
3832     }
3833
3834   /* Stack adjustment for exception handler.  */
3835   if (crtl->calls_eh_return)
3836     {
3837       /* We need to unwind the stack by the offset computed by
3838          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3839          to be SP; letting the CFA move during this adjustment
3840          is just as correct as retaining the CFA from the body
3841          of the function.  Therefore, do nothing special.  */
3842       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3843     }
3844
3845   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3846   if (!for_sibcall)
3847     emit_jump_insn (ret_rtx);
3848 }
3849
3850 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3851    normally or return to a previous frame after unwinding.
3852
3853    An EH return uses a single shared return sequence.  The epilogue is
3854    exactly like a normal epilogue except that it has an extra input
3855    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3856    that must be applied after the frame has been destroyed.  An extra label
3857    is inserted before the epilogue which initializes this register to zero,
3858    and this is the entry point for a normal return.
3859
3860    An actual EH return updates the return address, initializes the stack
3861    adjustment and jumps directly into the epilogue (bypassing the zeroing
3862    of the adjustment).  Since the return address is typically saved on the
3863    stack when a function makes a call, the saved LR must be updated outside
3864    the epilogue.
3865
3866    This poses problems as the store is generated well before the epilogue,
3867    so the offset of LR is not known yet.  Also optimizations will remove the
3868    store as it appears dead, even after the epilogue is generated (as the
3869    base or offset for loading LR is different in many cases).
3870
3871    To avoid these problems this implementation forces the frame pointer
3872    in eh_return functions so that the location of LR is fixed and known early.
3873    It also marks the store volatile, so no optimization is permitted to
3874    remove the store.  */
3875 rtx
3876 aarch64_eh_return_handler_rtx (void)
3877 {
3878   rtx tmp = gen_frame_mem (Pmode,
3879     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3880
3881   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3882   MEM_VOLATILE_P (tmp) = true;
3883   return tmp;
3884 }
3885
3886 /* Output code to add DELTA to the first argument, and then jump
3887    to FUNCTION.  Used for C++ multiple inheritance.  */
3888 static void
3889 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3890                          HOST_WIDE_INT delta,
3891                          HOST_WIDE_INT vcall_offset,
3892                          tree function)
3893 {
3894   /* The this pointer is always in x0.  Note that this differs from
3895      Arm where the this pointer maybe bumped to r1 if r0 is required
3896      to return a pointer to an aggregate.  On AArch64 a result value
3897      pointer will be in x8.  */
3898   int this_regno = R0_REGNUM;
3899   rtx this_rtx, temp0, temp1, addr, funexp;
3900   rtx_insn *insn;
3901
3902   reload_completed = 1;
3903   emit_note (NOTE_INSN_PROLOGUE_END);
3904
3905   if (vcall_offset == 0)
3906     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3907   else
3908     {
3909       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3910
3911       this_rtx = gen_rtx_REG (Pmode, this_regno);
3912       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3913       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3914
3915       addr = this_rtx;
3916       if (delta != 0)
3917         {
3918           if (delta >= -256 && delta < 256)
3919             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3920                                        plus_constant (Pmode, this_rtx, delta));
3921           else
3922             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3923         }
3924
3925       if (Pmode == ptr_mode)
3926         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3927       else
3928         aarch64_emit_move (temp0,
3929                            gen_rtx_ZERO_EXTEND (Pmode,
3930                                                 gen_rtx_MEM (ptr_mode, addr)));
3931
3932       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3933           addr = plus_constant (Pmode, temp0, vcall_offset);
3934       else
3935         {
3936           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3937                                           Pmode);
3938           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3939         }
3940
3941       if (Pmode == ptr_mode)
3942         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3943       else
3944         aarch64_emit_move (temp1,
3945                            gen_rtx_SIGN_EXTEND (Pmode,
3946                                                 gen_rtx_MEM (ptr_mode, addr)));
3947
3948       emit_insn (gen_add2_insn (this_rtx, temp1));
3949     }
3950
3951   /* Generate a tail call to the target function.  */
3952   if (!TREE_USED (function))
3953     {
3954       assemble_external (function);
3955       TREE_USED (function) = 1;
3956     }
3957   funexp = XEXP (DECL_RTL (function), 0);
3958   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3959   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3960   SIBLING_CALL_P (insn) = 1;
3961
3962   insn = get_insns ();
3963   shorten_branches (insn);
3964   final_start_function (insn, file, 1);
3965   final (insn, file, 1);
3966   final_end_function ();
3967
3968   /* Stop pretending to be a post-reload pass.  */
3969   reload_completed = 0;
3970 }
3971
3972 static bool
3973 aarch64_tls_referenced_p (rtx x)
3974 {
3975   if (!TARGET_HAVE_TLS)
3976     return false;
3977   subrtx_iterator::array_type array;
3978   FOR_EACH_SUBRTX (iter, array, x, ALL)
3979     {
3980       const_rtx x = *iter;
3981       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3982         return true;
3983       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3984          TLS offsets, not real symbol references.  */
3985       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3986         iter.skip_subrtxes ();
3987     }
3988   return false;
3989 }
3990
3991
3992 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3993    a left shift of 0 or 12 bits.  */
3994 bool
3995 aarch64_uimm12_shift (HOST_WIDE_INT val)
3996 {
3997   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3998           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3999           );
4000 }
4001
4002
4003 /* Return true if val is an immediate that can be loaded into a
4004    register by a MOVZ instruction.  */
4005 static bool
4006 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
4007 {
4008   if (GET_MODE_SIZE (mode) > 4)
4009     {
4010       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
4011           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
4012         return 1;
4013     }
4014   else
4015     {
4016       /* Ignore sign extension.  */
4017       val &= (HOST_WIDE_INT) 0xffffffff;
4018     }
4019   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
4020           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
4021 }
4022
4023 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4024
4025 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4026   {
4027     0x0000000100000001ull,
4028     0x0001000100010001ull,
4029     0x0101010101010101ull,
4030     0x1111111111111111ull,
4031     0x5555555555555555ull,
4032   };
4033
4034
4035 /* Return true if val is a valid bitmask immediate.  */
4036
4037 bool
4038 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4039 {
4040   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4041   int bits;
4042
4043   /* Check for a single sequence of one bits and return quickly if so.
4044      The special cases of all ones and all zeroes returns false.  */
4045   val = (unsigned HOST_WIDE_INT) val_in;
4046   tmp = val + (val & -val);
4047
4048   if (tmp == (tmp & -tmp))
4049     return (val + 1) > 1;
4050
4051   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4052   if (mode == SImode)
4053     val = (val << 32) | (val & 0xffffffff);
4054
4055   /* Invert if the immediate doesn't start with a zero bit - this means we
4056      only need to search for sequences of one bits.  */
4057   if (val & 1)
4058     val = ~val;
4059
4060   /* Find the first set bit and set tmp to val with the first sequence of one
4061      bits removed.  Return success if there is a single sequence of ones.  */
4062   first_one = val & -val;
4063   tmp = val & (val + first_one);
4064
4065   if (tmp == 0)
4066     return true;
4067
4068   /* Find the next set bit and compute the difference in bit position.  */
4069   next_one = tmp & -tmp;
4070   bits = clz_hwi (first_one) - clz_hwi (next_one);
4071   mask = val ^ tmp;
4072
4073   /* Check the bit position difference is a power of 2, and that the first
4074      sequence of one bits fits within 'bits' bits.  */
4075   if ((mask >> bits) != 0 || bits != (bits & -bits))
4076     return false;
4077
4078   /* Check the sequence of one bits is repeated 64/bits times.  */
4079   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4080 }
4081
4082 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4083    Assumed precondition: VAL_IN Is not zero.  */
4084
4085 unsigned HOST_WIDE_INT
4086 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4087 {
4088   int lowest_bit_set = ctz_hwi (val_in);
4089   int highest_bit_set = floor_log2 (val_in);
4090   gcc_assert (val_in != 0);
4091
4092   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4093           (HOST_WIDE_INT_1U << lowest_bit_set));
4094 }
4095
4096 /* Create constant where bits outside of lowest bit set to highest bit set
4097    are set to 1.  */
4098
4099 unsigned HOST_WIDE_INT
4100 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4101 {
4102   return val_in | ~aarch64_and_split_imm1 (val_in);
4103 }
4104
4105 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4106
4107 bool
4108 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4109 {
4110   scalar_int_mode int_mode;
4111   if (!is_a <scalar_int_mode> (mode, &int_mode))
4112     return false;
4113
4114   if (aarch64_bitmask_imm (val_in, int_mode))
4115     return false;
4116
4117   if (aarch64_move_imm (val_in, int_mode))
4118     return false;
4119
4120   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4121
4122   return aarch64_bitmask_imm (imm2, int_mode);
4123 }
4124
4125 /* Return true if val is an immediate that can be loaded into a
4126    register in a single instruction.  */
4127 bool
4128 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4129 {
4130   scalar_int_mode int_mode;
4131   if (!is_a <scalar_int_mode> (mode, &int_mode))
4132     return false;
4133
4134   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
4135     return 1;
4136   return aarch64_bitmask_imm (val, int_mode);
4137 }
4138
4139 static bool
4140 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4141 {
4142   rtx base, offset;
4143
4144   if (GET_CODE (x) == HIGH)
4145     return true;
4146
4147   split_const (x, &base, &offset);
4148   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4149     {
4150       if (aarch64_classify_symbol (base, offset)
4151           != SYMBOL_FORCE_TO_MEM)
4152         return true;
4153       else
4154         /* Avoid generating a 64-bit relocation in ILP32; leave
4155            to aarch64_expand_mov_immediate to handle it properly.  */
4156         return mode != ptr_mode;
4157     }
4158
4159   return aarch64_tls_referenced_p (x);
4160 }
4161
4162 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4163    The expansion for a table switch is quite expensive due to the number
4164    of instructions, the table lookup and hard to predict indirect jump.
4165    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4166    set, otherwise use tables for > 16 cases as a tradeoff between size and
4167    performance.  When optimizing for size, use the default setting.  */
4168
4169 static unsigned int
4170 aarch64_case_values_threshold (void)
4171 {
4172   /* Use the specified limit for the number of cases before using jump
4173      tables at higher optimization levels.  */
4174   if (optimize > 2
4175       && selected_cpu->tune->max_case_values != 0)
4176     return selected_cpu->tune->max_case_values;
4177   else
4178     return optimize_size ? default_case_values_threshold () : 17;
4179 }
4180
4181 /* Return true if register REGNO is a valid index register.
4182    STRICT_P is true if REG_OK_STRICT is in effect.  */
4183
4184 bool
4185 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4186 {
4187   if (!HARD_REGISTER_NUM_P (regno))
4188     {
4189       if (!strict_p)
4190         return true;
4191
4192       if (!reg_renumber)
4193         return false;
4194
4195       regno = reg_renumber[regno];
4196     }
4197   return GP_REGNUM_P (regno);
4198 }
4199
4200 /* Return true if register REGNO is a valid base register for mode MODE.
4201    STRICT_P is true if REG_OK_STRICT is in effect.  */
4202
4203 bool
4204 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4205 {
4206   if (!HARD_REGISTER_NUM_P (regno))
4207     {
4208       if (!strict_p)
4209         return true;
4210
4211       if (!reg_renumber)
4212         return false;
4213
4214       regno = reg_renumber[regno];
4215     }
4216
4217   /* The fake registers will be eliminated to either the stack or
4218      hard frame pointer, both of which are usually valid base registers.
4219      Reload deals with the cases where the eliminated form isn't valid.  */
4220   return (GP_REGNUM_P (regno)
4221           || regno == SP_REGNUM
4222           || regno == FRAME_POINTER_REGNUM
4223           || regno == ARG_POINTER_REGNUM);
4224 }
4225
4226 /* Return true if X is a valid base register for mode MODE.
4227    STRICT_P is true if REG_OK_STRICT is in effect.  */
4228
4229 static bool
4230 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4231 {
4232   if (!strict_p
4233       && GET_CODE (x) == SUBREG
4234       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
4235     x = SUBREG_REG (x);
4236
4237   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4238 }
4239
4240 /* Return true if address offset is a valid index.  If it is, fill in INFO
4241    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4242
4243 static bool
4244 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4245                         machine_mode mode, bool strict_p)
4246 {
4247   enum aarch64_address_type type;
4248   rtx index;
4249   int shift;
4250
4251   /* (reg:P) */
4252   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4253       && GET_MODE (x) == Pmode)
4254     {
4255       type = ADDRESS_REG_REG;
4256       index = x;
4257       shift = 0;
4258     }
4259   /* (sign_extend:DI (reg:SI)) */
4260   else if ((GET_CODE (x) == SIGN_EXTEND
4261             || GET_CODE (x) == ZERO_EXTEND)
4262            && GET_MODE (x) == DImode
4263            && GET_MODE (XEXP (x, 0)) == SImode)
4264     {
4265       type = (GET_CODE (x) == SIGN_EXTEND)
4266         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4267       index = XEXP (x, 0);
4268       shift = 0;
4269     }
4270   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4271   else if (GET_CODE (x) == MULT
4272            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4273                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4274            && GET_MODE (XEXP (x, 0)) == DImode
4275            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4276            && CONST_INT_P (XEXP (x, 1)))
4277     {
4278       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4279         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4280       index = XEXP (XEXP (x, 0), 0);
4281       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4282     }
4283   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4284   else if (GET_CODE (x) == ASHIFT
4285            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4286                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4287            && GET_MODE (XEXP (x, 0)) == DImode
4288            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4289            && CONST_INT_P (XEXP (x, 1)))
4290     {
4291       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4292         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4293       index = XEXP (XEXP (x, 0), 0);
4294       shift = INTVAL (XEXP (x, 1));
4295     }
4296   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4297   else if ((GET_CODE (x) == SIGN_EXTRACT
4298             || GET_CODE (x) == ZERO_EXTRACT)
4299            && GET_MODE (x) == DImode
4300            && GET_CODE (XEXP (x, 0)) == MULT
4301            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4302            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4303     {
4304       type = (GET_CODE (x) == SIGN_EXTRACT)
4305         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4306       index = XEXP (XEXP (x, 0), 0);
4307       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4308       if (INTVAL (XEXP (x, 1)) != 32 + shift
4309           || INTVAL (XEXP (x, 2)) != 0)
4310         shift = -1;
4311     }
4312   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4313      (const_int 0xffffffff<<shift)) */
4314   else if (GET_CODE (x) == AND
4315            && GET_MODE (x) == DImode
4316            && GET_CODE (XEXP (x, 0)) == MULT
4317            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4318            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4319            && CONST_INT_P (XEXP (x, 1)))
4320     {
4321       type = ADDRESS_REG_UXTW;
4322       index = XEXP (XEXP (x, 0), 0);
4323       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4324       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4325         shift = -1;
4326     }
4327   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4328   else if ((GET_CODE (x) == SIGN_EXTRACT
4329             || GET_CODE (x) == ZERO_EXTRACT)
4330            && GET_MODE (x) == DImode
4331            && GET_CODE (XEXP (x, 0)) == ASHIFT
4332            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4333            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4334     {
4335       type = (GET_CODE (x) == SIGN_EXTRACT)
4336         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4337       index = XEXP (XEXP (x, 0), 0);
4338       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4339       if (INTVAL (XEXP (x, 1)) != 32 + shift
4340           || INTVAL (XEXP (x, 2)) != 0)
4341         shift = -1;
4342     }
4343   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4344      (const_int 0xffffffff<<shift)) */
4345   else if (GET_CODE (x) == AND
4346            && GET_MODE (x) == DImode
4347            && GET_CODE (XEXP (x, 0)) == ASHIFT
4348            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4349            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4350            && CONST_INT_P (XEXP (x, 1)))
4351     {
4352       type = ADDRESS_REG_UXTW;
4353       index = XEXP (XEXP (x, 0), 0);
4354       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4355       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4356         shift = -1;
4357     }
4358   /* (mult:P (reg:P) (const_int scale)) */
4359   else if (GET_CODE (x) == MULT
4360            && GET_MODE (x) == Pmode
4361            && GET_MODE (XEXP (x, 0)) == Pmode
4362            && CONST_INT_P (XEXP (x, 1)))
4363     {
4364       type = ADDRESS_REG_REG;
4365       index = XEXP (x, 0);
4366       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4367     }
4368   /* (ashift:P (reg:P) (const_int shift)) */
4369   else if (GET_CODE (x) == ASHIFT
4370            && GET_MODE (x) == Pmode
4371            && GET_MODE (XEXP (x, 0)) == Pmode
4372            && CONST_INT_P (XEXP (x, 1)))
4373     {
4374       type = ADDRESS_REG_REG;
4375       index = XEXP (x, 0);
4376       shift = INTVAL (XEXP (x, 1));
4377     }
4378   else
4379     return false;
4380
4381   if (!strict_p
4382       && GET_CODE (index) == SUBREG
4383       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
4384     index = SUBREG_REG (index);
4385
4386   if ((shift == 0 ||
4387        (shift > 0 && shift <= 3
4388         && (1 << shift) == GET_MODE_SIZE (mode)))
4389       && REG_P (index)
4390       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4391     {
4392       info->type = type;
4393       info->offset = index;
4394       info->shift = shift;
4395       return true;
4396     }
4397
4398   return false;
4399 }
4400
4401 /* Return true if MODE is one of the modes for which we
4402    support LDP/STP operations.  */
4403
4404 static bool
4405 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4406 {
4407   return mode == SImode || mode == DImode
4408          || mode == SFmode || mode == DFmode
4409          || (aarch64_vector_mode_supported_p (mode)
4410              && GET_MODE_SIZE (mode) == 8);
4411 }
4412
4413 /* Return true if REGNO is a virtual pointer register, or an eliminable
4414    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4415    include stack_pointer or hard_frame_pointer.  */
4416 static bool
4417 virt_or_elim_regno_p (unsigned regno)
4418 {
4419   return ((regno >= FIRST_VIRTUAL_REGISTER
4420            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4421           || regno == FRAME_POINTER_REGNUM
4422           || regno == ARG_POINTER_REGNUM);
4423 }
4424
4425 /* Return true if X is a valid address for machine mode MODE.  If it is,
4426    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4427    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4428
4429 static bool
4430 aarch64_classify_address (struct aarch64_address_info *info,
4431                           rtx x, machine_mode mode,
4432                           RTX_CODE outer_code, bool strict_p)
4433 {
4434   enum rtx_code code = GET_CODE (x);
4435   rtx op0, op1;
4436
4437   /* On BE, we use load/store pair for all large int mode load/stores.
4438      TI/TFmode may also use a load/store pair.  */
4439   bool load_store_pair_p = (outer_code == PARALLEL
4440                             || mode == TImode
4441                             || mode == TFmode
4442                             || (BYTES_BIG_ENDIAN
4443                                 && aarch64_vect_struct_mode_p (mode)));
4444
4445   bool allow_reg_index_p =
4446     !load_store_pair_p
4447     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4448     && !aarch64_vect_struct_mode_p (mode);
4449
4450   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4451      REG addressing.  */
4452   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4453       && (code != POST_INC && code != REG))
4454     return false;
4455
4456   switch (code)
4457     {
4458     case REG:
4459     case SUBREG:
4460       info->type = ADDRESS_REG_IMM;
4461       info->base = x;
4462       info->offset = const0_rtx;
4463       return aarch64_base_register_rtx_p (x, strict_p);
4464
4465     case PLUS:
4466       op0 = XEXP (x, 0);
4467       op1 = XEXP (x, 1);
4468
4469       if (! strict_p
4470           && REG_P (op0)
4471           && virt_or_elim_regno_p (REGNO (op0))
4472           && CONST_INT_P (op1))
4473         {
4474           info->type = ADDRESS_REG_IMM;
4475           info->base = op0;
4476           info->offset = op1;
4477
4478           return true;
4479         }
4480
4481       if (GET_MODE_SIZE (mode) != 0
4482           && CONST_INT_P (op1)
4483           && aarch64_base_register_rtx_p (op0, strict_p))
4484         {
4485           HOST_WIDE_INT offset = INTVAL (op1);
4486
4487           info->type = ADDRESS_REG_IMM;
4488           info->base = op0;
4489           info->offset = op1;
4490
4491           /* TImode and TFmode values are allowed in both pairs of X
4492              registers and individual Q registers.  The available
4493              address modes are:
4494              X,X: 7-bit signed scaled offset
4495              Q:   9-bit signed offset
4496              We conservatively require an offset representable in either mode.
4497              When performing the check for pairs of X registers i.e.  LDP/STP
4498              pass down DImode since that is the natural size of the LDP/STP
4499              instruction memory accesses.  */
4500           if (mode == TImode || mode == TFmode)
4501             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4502                     && (offset_9bit_signed_unscaled_p (mode, offset)
4503                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4504
4505           /* A 7bit offset check because OImode will emit a ldp/stp
4506              instruction (only big endian will get here).
4507              For ldp/stp instructions, the offset is scaled for the size of a
4508              single element of the pair.  */
4509           if (mode == OImode)
4510             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4511
4512           /* Three 9/12 bit offsets checks because CImode will emit three
4513              ldr/str instructions (only big endian will get here).  */
4514           if (mode == CImode)
4515             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4516                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4517                         || offset_12bit_unsigned_scaled_p (V16QImode,
4518                                                            offset + 32)));
4519
4520           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4521              instructions (only big endian will get here).  */
4522           if (mode == XImode)
4523             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4524                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4525                                                             offset + 32));
4526
4527           if (load_store_pair_p)
4528             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4529                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4530           else
4531             return (offset_9bit_signed_unscaled_p (mode, offset)
4532                     || offset_12bit_unsigned_scaled_p (mode, offset));
4533         }
4534
4535       if (allow_reg_index_p)
4536         {
4537           /* Look for base + (scaled/extended) index register.  */
4538           if (aarch64_base_register_rtx_p (op0, strict_p)
4539               && aarch64_classify_index (info, op1, mode, strict_p))
4540             {
4541               info->base = op0;
4542               return true;
4543             }
4544           if (aarch64_base_register_rtx_p (op1, strict_p)
4545               && aarch64_classify_index (info, op0, mode, strict_p))
4546             {
4547               info->base = op1;
4548               return true;
4549             }
4550         }
4551
4552       return false;
4553
4554     case POST_INC:
4555     case POST_DEC:
4556     case PRE_INC:
4557     case PRE_DEC:
4558       info->type = ADDRESS_REG_WB;
4559       info->base = XEXP (x, 0);
4560       info->offset = NULL_RTX;
4561       return aarch64_base_register_rtx_p (info->base, strict_p);
4562
4563     case POST_MODIFY:
4564     case PRE_MODIFY:
4565       info->type = ADDRESS_REG_WB;
4566       info->base = XEXP (x, 0);
4567       if (GET_CODE (XEXP (x, 1)) == PLUS
4568           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4569           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4570           && aarch64_base_register_rtx_p (info->base, strict_p))
4571         {
4572           HOST_WIDE_INT offset;
4573           info->offset = XEXP (XEXP (x, 1), 1);
4574           offset = INTVAL (info->offset);
4575
4576           /* TImode and TFmode values are allowed in both pairs of X
4577              registers and individual Q registers.  The available
4578              address modes are:
4579              X,X: 7-bit signed scaled offset
4580              Q:   9-bit signed offset
4581              We conservatively require an offset representable in either mode.
4582            */
4583           if (mode == TImode || mode == TFmode)
4584             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4585                     && offset_9bit_signed_unscaled_p (mode, offset));
4586
4587           if (load_store_pair_p)
4588             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4589                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4590           else
4591             return offset_9bit_signed_unscaled_p (mode, offset);
4592         }
4593       return false;
4594
4595     case CONST:
4596     case SYMBOL_REF:
4597     case LABEL_REF:
4598       /* load literal: pc-relative constant pool entry.  Only supported
4599          for SI mode or larger.  */
4600       info->type = ADDRESS_SYMBOLIC;
4601
4602       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4603         {
4604           rtx sym, addend;
4605
4606           split_const (x, &sym, &addend);
4607           return ((GET_CODE (sym) == LABEL_REF
4608                    || (GET_CODE (sym) == SYMBOL_REF
4609                        && CONSTANT_POOL_ADDRESS_P (sym)
4610                        && aarch64_pcrelative_literal_loads)));
4611         }
4612       return false;
4613
4614     case LO_SUM:
4615       info->type = ADDRESS_LO_SUM;
4616       info->base = XEXP (x, 0);
4617       info->offset = XEXP (x, 1);
4618       if (allow_reg_index_p
4619           && aarch64_base_register_rtx_p (info->base, strict_p))
4620         {
4621           rtx sym, offs;
4622           split_const (info->offset, &sym, &offs);
4623           if (GET_CODE (sym) == SYMBOL_REF
4624               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4625             {
4626               /* The symbol and offset must be aligned to the access size.  */
4627               unsigned int align;
4628               unsigned int ref_size;
4629
4630               if (CONSTANT_POOL_ADDRESS_P (sym))
4631                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4632               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4633                 {
4634                   tree exp = SYMBOL_REF_DECL (sym);
4635                   align = TYPE_ALIGN (TREE_TYPE (exp));
4636                   align = aarch64_constant_alignment (exp, align);
4637                 }
4638               else if (SYMBOL_REF_DECL (sym))
4639                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4640               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4641                        && SYMBOL_REF_BLOCK (sym) != NULL)
4642                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4643               else
4644                 align = BITS_PER_UNIT;
4645
4646               ref_size = GET_MODE_SIZE (mode);
4647               if (ref_size == 0)
4648                 ref_size = GET_MODE_SIZE (DImode);
4649
4650               return ((INTVAL (offs) & (ref_size - 1)) == 0
4651                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4652             }
4653         }
4654       return false;
4655
4656     default:
4657       return false;
4658     }
4659 }
4660
4661 /* Return true if the address X is valid for a PRFM instruction.
4662    STRICT_P is true if we should do strict checking with
4663    aarch64_classify_address.  */
4664
4665 bool
4666 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4667 {
4668   struct aarch64_address_info addr;
4669
4670   /* PRFM accepts the same addresses as DImode...  */
4671   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4672   if (!res)
4673     return false;
4674
4675   /* ... except writeback forms.  */
4676   return addr.type != ADDRESS_REG_WB;
4677 }
4678
4679 bool
4680 aarch64_symbolic_address_p (rtx x)
4681 {
4682   rtx offset;
4683
4684   split_const (x, &x, &offset);
4685   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4686 }
4687
4688 /* Classify the base of symbolic expression X.  */
4689
4690 enum aarch64_symbol_type
4691 aarch64_classify_symbolic_expression (rtx x)
4692 {
4693   rtx offset;
4694
4695   split_const (x, &x, &offset);
4696   return aarch64_classify_symbol (x, offset);
4697 }
4698
4699
4700 /* Return TRUE if X is a legitimate address for accessing memory in
4701    mode MODE.  */
4702 static bool
4703 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4704 {
4705   struct aarch64_address_info addr;
4706
4707   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4708 }
4709
4710 /* Return TRUE if X is a legitimate address for accessing memory in
4711    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4712    pair operation.  */
4713 bool
4714 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4715                               RTX_CODE outer_code, bool strict_p)
4716 {
4717   struct aarch64_address_info addr;
4718
4719   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4720 }
4721
4722 /* Split an out-of-range address displacement into a base and offset.
4723    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4724    to increase opportunities for sharing the base address of different sizes.
4725    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4726 static bool
4727 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4728 {
4729   HOST_WIDE_INT offset = INTVAL (*disp);
4730   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4731
4732   if (mode == TImode || mode == TFmode
4733       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4734     base = (offset + 0x100) & ~0x1ff;
4735
4736   *off = GEN_INT (base);
4737   *disp = GEN_INT (offset - base);
4738   return true;
4739 }
4740
4741 /* Return the binary representation of floating point constant VALUE in INTVAL.
4742    If the value cannot be converted, return false without setting INTVAL.
4743    The conversion is done in the given MODE.  */
4744 bool
4745 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4746 {
4747
4748   /* We make a general exception for 0.  */
4749   if (aarch64_float_const_zero_rtx_p (value))
4750     {
4751       *intval = 0;
4752       return true;
4753     }
4754
4755   machine_mode mode = GET_MODE (value);
4756   if (GET_CODE (value) != CONST_DOUBLE
4757       || !SCALAR_FLOAT_MODE_P (mode)
4758       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4759       /* Only support up to DF mode.  */
4760       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4761     return false;
4762
4763   unsigned HOST_WIDE_INT ival = 0;
4764
4765   long res[2];
4766   real_to_target (res,
4767                   CONST_DOUBLE_REAL_VALUE (value),
4768                   REAL_MODE_FORMAT (mode));
4769
4770   if (mode == DFmode)
4771     {
4772       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4773       ival = zext_hwi (res[order], 32);
4774       ival |= (zext_hwi (res[1 - order], 32) << 32);
4775     }
4776   else
4777       ival = zext_hwi (res[0], 32);
4778
4779   *intval = ival;
4780   return true;
4781 }
4782
4783 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4784    single MOV(+MOVK) followed by an FMOV.  */
4785 bool
4786 aarch64_float_const_rtx_p (rtx x)
4787 {
4788   machine_mode mode = GET_MODE (x);
4789   if (mode == VOIDmode)
4790     return false;
4791
4792   /* Determine whether it's cheaper to write float constants as
4793      mov/movk pairs over ldr/adrp pairs.  */
4794   unsigned HOST_WIDE_INT ival;
4795
4796   if (GET_CODE (x) == CONST_DOUBLE
4797       && SCALAR_FLOAT_MODE_P (mode)
4798       && aarch64_reinterpret_float_as_int (x, &ival))
4799     {
4800       scalar_int_mode imode = (mode == HFmode
4801                                ? SImode
4802                                : int_mode_for_mode (mode).require ());
4803       int num_instr = aarch64_internal_mov_immediate
4804                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4805       return num_instr < 3;
4806     }
4807
4808   return false;
4809 }
4810
4811 /* Return TRUE if rtx X is immediate constant 0.0 */
4812 bool
4813 aarch64_float_const_zero_rtx_p (rtx x)
4814 {
4815   if (GET_MODE (x) == VOIDmode)
4816     return false;
4817
4818   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4819     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4820   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4821 }
4822
4823 /* Return TRUE if rtx X is immediate constant that fits in a single
4824    MOVI immediate operation.  */
4825 bool
4826 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4827 {
4828   if (!TARGET_SIMD)
4829      return false;
4830
4831   machine_mode vmode;
4832   scalar_int_mode imode;
4833   unsigned HOST_WIDE_INT ival;
4834
4835   if (GET_CODE (x) == CONST_DOUBLE
4836       && SCALAR_FLOAT_MODE_P (mode))
4837     {
4838       if (!aarch64_reinterpret_float_as_int (x, &ival))
4839         return false;
4840
4841       /* We make a general exception for 0.  */
4842       if (aarch64_float_const_zero_rtx_p (x))
4843         return true;
4844
4845       imode = int_mode_for_mode (mode).require ();
4846     }
4847   else if (GET_CODE (x) == CONST_INT
4848            && is_a <scalar_int_mode> (mode, &imode))
4849     ival = INTVAL (x);
4850   else
4851     return false;
4852
4853    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4854      a 128 bit vector mode.  */
4855   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
4856
4857   vmode = aarch64_simd_container_mode (imode, width);
4858   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4859
4860   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4861 }
4862
4863
4864 /* Return the fixed registers used for condition codes.  */
4865
4866 static bool
4867 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4868 {
4869   *p1 = CC_REGNUM;
4870   *p2 = INVALID_REGNUM;
4871   return true;
4872 }
4873
4874 /* This function is used by the call expanders of the machine description.
4875    RESULT is the register in which the result is returned.  It's NULL for
4876    "call" and "sibcall".
4877    MEM is the location of the function call.
4878    SIBCALL indicates whether this function call is normal call or sibling call.
4879    It will generate different pattern accordingly.  */
4880
4881 void
4882 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4883 {
4884   rtx call, callee, tmp;
4885   rtvec vec;
4886   machine_mode mode;
4887
4888   gcc_assert (MEM_P (mem));
4889   callee = XEXP (mem, 0);
4890   mode = GET_MODE (callee);
4891   gcc_assert (mode == Pmode);
4892
4893   /* Decide if we should generate indirect calls by loading the
4894      address of the callee into a register before performing
4895      the branch-and-link.  */
4896   if (SYMBOL_REF_P (callee)
4897       ? (aarch64_is_long_call_p (callee)
4898          || aarch64_is_noplt_call_p (callee))
4899       : !REG_P (callee))
4900     XEXP (mem, 0) = force_reg (mode, callee);
4901
4902   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4903
4904   if (result != NULL_RTX)
4905     call = gen_rtx_SET (result, call);
4906
4907   if (sibcall)
4908     tmp = ret_rtx;
4909   else
4910     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4911
4912   vec = gen_rtvec (2, call, tmp);
4913   call = gen_rtx_PARALLEL (VOIDmode, vec);
4914
4915   aarch64_emit_call_insn (call);
4916 }
4917
4918 /* Emit call insn with PAT and do aarch64-specific handling.  */
4919
4920 void
4921 aarch64_emit_call_insn (rtx pat)
4922 {
4923   rtx insn = emit_call_insn (pat);
4924
4925   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4926   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4927   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4928 }
4929
4930 machine_mode
4931 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4932 {
4933   /* All floating point compares return CCFP if it is an equality
4934      comparison, and CCFPE otherwise.  */
4935   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4936     {
4937       switch (code)
4938         {
4939         case EQ:
4940         case NE:
4941         case UNORDERED:
4942         case ORDERED:
4943         case UNLT:
4944         case UNLE:
4945         case UNGT:
4946         case UNGE:
4947         case UNEQ:
4948         case LTGT:
4949           return CCFPmode;
4950
4951         case LT:
4952         case LE:
4953         case GT:
4954         case GE:
4955           return CCFPEmode;
4956
4957         default:
4958           gcc_unreachable ();
4959         }
4960     }
4961
4962   /* Equality comparisons of short modes against zero can be performed
4963      using the TST instruction with the appropriate bitmask.  */
4964   if (y == const0_rtx && REG_P (x)
4965       && (code == EQ || code == NE)
4966       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4967     return CC_NZmode;
4968
4969   /* Similarly, comparisons of zero_extends from shorter modes can
4970      be performed using an ANDS with an immediate mask.  */
4971   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4972       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4973       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4974       && (code == EQ || code == NE))
4975     return CC_NZmode;
4976
4977   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4978       && y == const0_rtx
4979       && (code == EQ || code == NE || code == LT || code == GE)
4980       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4981           || GET_CODE (x) == NEG
4982           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4983               && CONST_INT_P (XEXP (x, 2)))))
4984     return CC_NZmode;
4985
4986   /* A compare with a shifted operand.  Because of canonicalization,
4987      the comparison will have to be swapped when we emit the assembly
4988      code.  */
4989   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4990       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4991       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4992           || GET_CODE (x) == LSHIFTRT
4993           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4994     return CC_SWPmode;
4995
4996   /* Similarly for a negated operand, but we can only do this for
4997      equalities.  */
4998   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4999       && (REG_P (y) || GET_CODE (y) == SUBREG)
5000       && (code == EQ || code == NE)
5001       && GET_CODE (x) == NEG)
5002     return CC_Zmode;
5003
5004   /* A test for unsigned overflow.  */
5005   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
5006       && code == NE
5007       && GET_CODE (x) == PLUS
5008       && GET_CODE (y) == ZERO_EXTEND)
5009     return CC_Cmode;
5010
5011   /* For everything else, return CCmode.  */
5012   return CCmode;
5013 }
5014
5015 static int
5016 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
5017
5018 int
5019 aarch64_get_condition_code (rtx x)
5020 {
5021   machine_mode mode = GET_MODE (XEXP (x, 0));
5022   enum rtx_code comp_code = GET_CODE (x);
5023
5024   if (GET_MODE_CLASS (mode) != MODE_CC)
5025     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
5026   return aarch64_get_condition_code_1 (mode, comp_code);
5027 }
5028
5029 static int
5030 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
5031 {
5032   switch (mode)
5033     {
5034     case E_CCFPmode:
5035     case E_CCFPEmode:
5036       switch (comp_code)
5037         {
5038         case GE: return AARCH64_GE;
5039         case GT: return AARCH64_GT;
5040         case LE: return AARCH64_LS;
5041         case LT: return AARCH64_MI;
5042         case NE: return AARCH64_NE;
5043         case EQ: return AARCH64_EQ;
5044         case ORDERED: return AARCH64_VC;
5045         case UNORDERED: return AARCH64_VS;
5046         case UNLT: return AARCH64_LT;
5047         case UNLE: return AARCH64_LE;
5048         case UNGT: return AARCH64_HI;
5049         case UNGE: return AARCH64_PL;
5050         default: return -1;
5051         }
5052       break;
5053
5054     case E_CCmode:
5055       switch (comp_code)
5056         {
5057         case NE: return AARCH64_NE;
5058         case EQ: return AARCH64_EQ;
5059         case GE: return AARCH64_GE;
5060         case GT: return AARCH64_GT;
5061         case LE: return AARCH64_LE;
5062         case LT: return AARCH64_LT;
5063         case GEU: return AARCH64_CS;
5064         case GTU: return AARCH64_HI;
5065         case LEU: return AARCH64_LS;
5066         case LTU: return AARCH64_CC;
5067         default: return -1;
5068         }
5069       break;
5070
5071     case E_CC_SWPmode:
5072       switch (comp_code)
5073         {
5074         case NE: return AARCH64_NE;
5075         case EQ: return AARCH64_EQ;
5076         case GE: return AARCH64_LE;
5077         case GT: return AARCH64_LT;
5078         case LE: return AARCH64_GE;
5079         case LT: return AARCH64_GT;
5080         case GEU: return AARCH64_LS;
5081         case GTU: return AARCH64_CC;
5082         case LEU: return AARCH64_CS;
5083         case LTU: return AARCH64_HI;
5084         default: return -1;
5085         }
5086       break;
5087
5088     case E_CC_NZmode:
5089       switch (comp_code)
5090         {
5091         case NE: return AARCH64_NE;
5092         case EQ: return AARCH64_EQ;
5093         case GE: return AARCH64_PL;
5094         case LT: return AARCH64_MI;
5095         default: return -1;
5096         }
5097       break;
5098
5099     case E_CC_Zmode:
5100       switch (comp_code)
5101         {
5102         case NE: return AARCH64_NE;
5103         case EQ: return AARCH64_EQ;
5104         default: return -1;
5105         }
5106       break;
5107
5108     case E_CC_Cmode:
5109       switch (comp_code)
5110         {
5111         case NE: return AARCH64_CS;
5112         case EQ: return AARCH64_CC;
5113         default: return -1;
5114         }
5115       break;
5116
5117     default:
5118       return -1;
5119     }
5120
5121   return -1;
5122 }
5123
5124 bool
5125 aarch64_const_vec_all_same_in_range_p (rtx x,
5126                                   HOST_WIDE_INT minval,
5127                                   HOST_WIDE_INT maxval)
5128 {
5129   HOST_WIDE_INT firstval;
5130   int count, i;
5131
5132   if (GET_CODE (x) != CONST_VECTOR
5133       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5134     return false;
5135
5136   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5137   if (firstval < minval || firstval > maxval)
5138     return false;
5139
5140   count = CONST_VECTOR_NUNITS (x);
5141   for (i = 1; i < count; i++)
5142     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5143       return false;
5144
5145   return true;
5146 }
5147
5148 bool
5149 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5150 {
5151   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5152 }
5153
5154
5155 /* N Z C V.  */
5156 #define AARCH64_CC_V 1
5157 #define AARCH64_CC_C (1 << 1)
5158 #define AARCH64_CC_Z (1 << 2)
5159 #define AARCH64_CC_N (1 << 3)
5160
5161 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5162 static const int aarch64_nzcv_codes[] =
5163 {
5164   0,            /* EQ, Z == 1.  */
5165   AARCH64_CC_Z, /* NE, Z == 0.  */
5166   0,            /* CS, C == 1.  */
5167   AARCH64_CC_C, /* CC, C == 0.  */
5168   0,            /* MI, N == 1.  */
5169   AARCH64_CC_N, /* PL, N == 0.  */
5170   0,            /* VS, V == 1.  */
5171   AARCH64_CC_V, /* VC, V == 0.  */
5172   0,            /* HI, C ==1 && Z == 0.  */
5173   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5174   AARCH64_CC_V, /* GE, N == V.  */
5175   0,            /* LT, N != V.  */
5176   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5177   0,            /* LE, !(Z == 0 && N == V).  */
5178   0,            /* AL, Any.  */
5179   0             /* NV, Any.  */
5180 };
5181
5182 /* Print operand X to file F in a target specific manner according to CODE.
5183    The acceptable formatting commands given by CODE are:
5184      'c':               An integer or symbol address without a preceding #
5185                         sign.
5186      'e':               Print the sign/zero-extend size as a character 8->b,
5187                         16->h, 32->w.
5188      'p':               Prints N such that 2^N == X (X must be power of 2 and
5189                         const int).
5190      'P':               Print the number of non-zero bits in X (a const_int).
5191      'H':               Print the higher numbered register of a pair (TImode)
5192                         of regs.
5193      'm':               Print a condition (eq, ne, etc).
5194      'M':               Same as 'm', but invert condition.
5195      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5196      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5197                         The register printed is the FP/SIMD register name
5198                         of X + 0/1/2/3 for S/T/U/V.
5199      'R':               Print a scalar FP/SIMD register name + 1.
5200      'X':               Print bottom 16 bits of integer constant in hex.
5201      'w/x':             Print a general register name or the zero register
5202                         (32-bit or 64-bit).
5203      '0':               Print a normal operand, if it's a general register,
5204                         then we assume DImode.
5205      'k':               Print NZCV for conditional compare instructions.
5206      'A':               Output address constant representing the first
5207                         argument of X, specifying a relocation offset
5208                         if appropriate.
5209      'L':               Output constant address specified by X
5210                         with a relocation offset if appropriate.
5211      'G':               Prints address of X, specifying a PC relative
5212                         relocation mode if appropriate.  */
5213
5214 static void
5215 aarch64_print_operand (FILE *f, rtx x, int code)
5216 {
5217   switch (code)
5218     {
5219     case 'c':
5220       switch (GET_CODE (x))
5221         {
5222         case CONST_INT:
5223           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5224           break;
5225
5226         case SYMBOL_REF:
5227           output_addr_const (f, x);
5228           break;
5229
5230         case CONST:
5231           if (GET_CODE (XEXP (x, 0)) == PLUS
5232               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5233             {
5234               output_addr_const (f, x);
5235               break;
5236             }
5237           /* Fall through.  */
5238
5239         default:
5240           output_operand_lossage ("Unsupported operand for code '%c'", code);
5241         }
5242       break;
5243
5244     case 'e':
5245       {
5246         int n;
5247
5248         if (!CONST_INT_P (x)
5249             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5250           {
5251             output_operand_lossage ("invalid operand for '%%%c'", code);
5252             return;
5253           }
5254
5255         switch (n)
5256           {
5257           case 3:
5258             fputc ('b', f);
5259             break;
5260           case 4:
5261             fputc ('h', f);
5262             break;
5263           case 5:
5264             fputc ('w', f);
5265             break;
5266           default:
5267             output_operand_lossage ("invalid operand for '%%%c'", code);
5268             return;
5269           }
5270       }
5271       break;
5272
5273     case 'p':
5274       {
5275         int n;
5276
5277         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5278           {
5279             output_operand_lossage ("invalid operand for '%%%c'", code);
5280             return;
5281           }
5282
5283         asm_fprintf (f, "%d", n);
5284       }
5285       break;
5286
5287     case 'P':
5288       if (!CONST_INT_P (x))
5289         {
5290           output_operand_lossage ("invalid operand for '%%%c'", code);
5291           return;
5292         }
5293
5294       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5295       break;
5296
5297     case 'H':
5298       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5299         {
5300           output_operand_lossage ("invalid operand for '%%%c'", code);
5301           return;
5302         }
5303
5304       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5305       break;
5306
5307     case 'M':
5308     case 'm':
5309       {
5310         int cond_code;
5311         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5312         if (x == const_true_rtx)
5313           {
5314             if (code == 'M')
5315               fputs ("nv", f);
5316             return;
5317           }
5318
5319         if (!COMPARISON_P (x))
5320           {
5321             output_operand_lossage ("invalid operand for '%%%c'", code);
5322             return;
5323           }
5324
5325         cond_code = aarch64_get_condition_code (x);
5326         gcc_assert (cond_code >= 0);
5327         if (code == 'M')
5328           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5329         fputs (aarch64_condition_codes[cond_code], f);
5330       }
5331       break;
5332
5333     case 'b':
5334     case 'h':
5335     case 's':
5336     case 'd':
5337     case 'q':
5338       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5339         {
5340           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5341           return;
5342         }
5343       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5344       break;
5345
5346     case 'S':
5347     case 'T':
5348     case 'U':
5349     case 'V':
5350       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5351         {
5352           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5353           return;
5354         }
5355       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5356       break;
5357
5358     case 'R':
5359       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5360         {
5361           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5362           return;
5363         }
5364       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5365       break;
5366
5367     case 'X':
5368       if (!CONST_INT_P (x))
5369         {
5370           output_operand_lossage ("invalid operand for '%%%c'", code);
5371           return;
5372         }
5373       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5374       break;
5375
5376     case 'w':
5377     case 'x':
5378       if (x == const0_rtx
5379           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5380         {
5381           asm_fprintf (f, "%czr", code);
5382           break;
5383         }
5384
5385       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5386         {
5387           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5388           break;
5389         }
5390
5391       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5392         {
5393           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5394           break;
5395         }
5396
5397       /* Fall through */
5398
5399     case 0:
5400       if (x == NULL)
5401         {
5402           output_operand_lossage ("missing operand");
5403           return;
5404         }
5405
5406       switch (GET_CODE (x))
5407         {
5408         case REG:
5409           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5410           break;
5411
5412         case MEM:
5413           output_address (GET_MODE (x), XEXP (x, 0));
5414           /* Check all memory references are Pmode - even with ILP32.  */
5415           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5416           break;
5417
5418         case CONST:
5419         case LABEL_REF:
5420         case SYMBOL_REF:
5421           output_addr_const (asm_out_file, x);
5422           break;
5423
5424         case CONST_INT:
5425           asm_fprintf (f, "%wd", INTVAL (x));
5426           break;
5427
5428         case CONST_VECTOR:
5429           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5430             {
5431               gcc_assert (
5432                   aarch64_const_vec_all_same_in_range_p (x,
5433                                                          HOST_WIDE_INT_MIN,
5434                                                          HOST_WIDE_INT_MAX));
5435               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5436             }
5437           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5438             {
5439               fputc ('0', f);
5440             }
5441           else
5442             gcc_unreachable ();
5443           break;
5444
5445         case CONST_DOUBLE:
5446           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5447              be getting CONST_DOUBLEs holding integers.  */
5448           gcc_assert (GET_MODE (x) != VOIDmode);
5449           if (aarch64_float_const_zero_rtx_p (x))
5450             {
5451               fputc ('0', f);
5452               break;
5453             }
5454           else if (aarch64_float_const_representable_p (x))
5455             {
5456 #define buf_size 20
5457               char float_buf[buf_size] = {'\0'};
5458               real_to_decimal_for_mode (float_buf,
5459                                         CONST_DOUBLE_REAL_VALUE (x),
5460                                         buf_size, buf_size,
5461                                         1, GET_MODE (x));
5462               asm_fprintf (asm_out_file, "%s", float_buf);
5463               break;
5464 #undef buf_size
5465             }
5466           output_operand_lossage ("invalid constant");
5467           return;
5468         default:
5469           output_operand_lossage ("invalid operand");
5470           return;
5471         }
5472       break;
5473
5474     case 'A':
5475       if (GET_CODE (x) == HIGH)
5476         x = XEXP (x, 0);
5477
5478       switch (aarch64_classify_symbolic_expression (x))
5479         {
5480         case SYMBOL_SMALL_GOT_4G:
5481           asm_fprintf (asm_out_file, ":got:");
5482           break;
5483
5484         case SYMBOL_SMALL_TLSGD:
5485           asm_fprintf (asm_out_file, ":tlsgd:");
5486           break;
5487
5488         case SYMBOL_SMALL_TLSDESC:
5489           asm_fprintf (asm_out_file, ":tlsdesc:");
5490           break;
5491
5492         case SYMBOL_SMALL_TLSIE:
5493           asm_fprintf (asm_out_file, ":gottprel:");
5494           break;
5495
5496         case SYMBOL_TLSLE24:
5497           asm_fprintf (asm_out_file, ":tprel:");
5498           break;
5499
5500         case SYMBOL_TINY_GOT:
5501           gcc_unreachable ();
5502           break;
5503
5504         default:
5505           break;
5506         }
5507       output_addr_const (asm_out_file, x);
5508       break;
5509
5510     case 'L':
5511       switch (aarch64_classify_symbolic_expression (x))
5512         {
5513         case SYMBOL_SMALL_GOT_4G:
5514           asm_fprintf (asm_out_file, ":lo12:");
5515           break;
5516
5517         case SYMBOL_SMALL_TLSGD:
5518           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5519           break;
5520
5521         case SYMBOL_SMALL_TLSDESC:
5522           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5523           break;
5524
5525         case SYMBOL_SMALL_TLSIE:
5526           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5527           break;
5528
5529         case SYMBOL_TLSLE12:
5530           asm_fprintf (asm_out_file, ":tprel_lo12:");
5531           break;
5532
5533         case SYMBOL_TLSLE24:
5534           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5535           break;
5536
5537         case SYMBOL_TINY_GOT:
5538           asm_fprintf (asm_out_file, ":got:");
5539           break;
5540
5541         case SYMBOL_TINY_TLSIE:
5542           asm_fprintf (asm_out_file, ":gottprel:");
5543           break;
5544
5545         default:
5546           break;
5547         }
5548       output_addr_const (asm_out_file, x);
5549       break;
5550
5551     case 'G':
5552       switch (aarch64_classify_symbolic_expression (x))
5553         {
5554         case SYMBOL_TLSLE24:
5555           asm_fprintf (asm_out_file, ":tprel_hi12:");
5556           break;
5557         default:
5558           break;
5559         }
5560       output_addr_const (asm_out_file, x);
5561       break;
5562
5563     case 'k':
5564       {
5565         HOST_WIDE_INT cond_code;
5566
5567         if (!CONST_INT_P (x))
5568           {
5569             output_operand_lossage ("invalid operand for '%%%c'", code);
5570             return;
5571           }
5572
5573         cond_code = INTVAL (x);
5574         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5575         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5576       }
5577       break;
5578
5579     default:
5580       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5581       return;
5582     }
5583 }
5584
5585 static void
5586 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5587 {
5588   struct aarch64_address_info addr;
5589
5590   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5591     switch (addr.type)
5592       {
5593       case ADDRESS_REG_IMM:
5594         if (addr.offset == const0_rtx)
5595           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5596         else
5597           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5598                        INTVAL (addr.offset));
5599         return;
5600
5601       case ADDRESS_REG_REG:
5602         if (addr.shift == 0)
5603           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5604                        reg_names [REGNO (addr.offset)]);
5605         else
5606           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5607                        reg_names [REGNO (addr.offset)], addr.shift);
5608         return;
5609
5610       case ADDRESS_REG_UXTW:
5611         if (addr.shift == 0)
5612           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5613                        REGNO (addr.offset) - R0_REGNUM);
5614         else
5615           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5616                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5617         return;
5618
5619       case ADDRESS_REG_SXTW:
5620         if (addr.shift == 0)
5621           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5622                        REGNO (addr.offset) - R0_REGNUM);
5623         else
5624           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5625                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5626         return;
5627
5628       case ADDRESS_REG_WB:
5629         switch (GET_CODE (x))
5630           {
5631           case PRE_INC:
5632             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5633                          GET_MODE_SIZE (mode));
5634             return;
5635           case POST_INC:
5636             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5637                          GET_MODE_SIZE (mode));
5638             return;
5639           case PRE_DEC:
5640             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5641                          GET_MODE_SIZE (mode));
5642             return;
5643           case POST_DEC:
5644             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5645                          GET_MODE_SIZE (mode));
5646             return;
5647           case PRE_MODIFY:
5648             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5649                          INTVAL (addr.offset));
5650             return;
5651           case POST_MODIFY:
5652             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5653                          INTVAL (addr.offset));
5654             return;
5655           default:
5656             break;
5657           }
5658         break;
5659
5660       case ADDRESS_LO_SUM:
5661         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5662         output_addr_const (f, addr.offset);
5663         asm_fprintf (f, "]");
5664         return;
5665
5666       case ADDRESS_SYMBOLIC:
5667         break;
5668       }
5669
5670   output_addr_const (f, x);
5671 }
5672
5673 bool
5674 aarch64_label_mentioned_p (rtx x)
5675 {
5676   const char *fmt;
5677   int i;
5678
5679   if (GET_CODE (x) == LABEL_REF)
5680     return true;
5681
5682   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5683      referencing instruction, but they are constant offsets, not
5684      symbols.  */
5685   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5686     return false;
5687
5688   fmt = GET_RTX_FORMAT (GET_CODE (x));
5689   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5690     {
5691       if (fmt[i] == 'E')
5692         {
5693           int j;
5694
5695           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5696             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5697               return 1;
5698         }
5699       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5700         return 1;
5701     }
5702
5703   return 0;
5704 }
5705
5706 /* Implement REGNO_REG_CLASS.  */
5707
5708 enum reg_class
5709 aarch64_regno_regclass (unsigned regno)
5710 {
5711   if (GP_REGNUM_P (regno))
5712     return GENERAL_REGS;
5713
5714   if (regno == SP_REGNUM)
5715     return STACK_REG;
5716
5717   if (regno == FRAME_POINTER_REGNUM
5718       || regno == ARG_POINTER_REGNUM)
5719     return POINTER_REGS;
5720
5721   if (FP_REGNUM_P (regno))
5722     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5723
5724   return NO_REGS;
5725 }
5726
5727 static rtx
5728 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5729 {
5730   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5731      where mask is selected by alignment and size of the offset.
5732      We try to pick as large a range for the offset as possible to
5733      maximize the chance of a CSE.  However, for aligned addresses
5734      we limit the range to 4k so that structures with different sized
5735      elements are likely to use the same base.  We need to be careful
5736      not to split a CONST for some forms of address expression, otherwise
5737      it will generate sub-optimal code.  */
5738
5739   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5740     {
5741       rtx base = XEXP (x, 0);
5742       rtx offset_rtx = XEXP (x, 1);
5743       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5744
5745       if (GET_CODE (base) == PLUS)
5746         {
5747           rtx op0 = XEXP (base, 0);
5748           rtx op1 = XEXP (base, 1);
5749
5750           /* Force any scaling into a temp for CSE.  */
5751           op0 = force_reg (Pmode, op0);
5752           op1 = force_reg (Pmode, op1);
5753
5754           /* Let the pointer register be in op0.  */
5755           if (REG_POINTER (op1))
5756             std::swap (op0, op1);
5757
5758           /* If the pointer is virtual or frame related, then we know that
5759              virtual register instantiation or register elimination is going
5760              to apply a second constant.  We want the two constants folded
5761              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5762           if (virt_or_elim_regno_p (REGNO (op0)))
5763             {
5764               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5765                                    NULL_RTX, true, OPTAB_DIRECT);
5766               return gen_rtx_PLUS (Pmode, base, op1);
5767             }
5768
5769           /* Otherwise, in order to encourage CSE (and thence loop strength
5770              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5771           base = expand_binop (Pmode, add_optab, op0, op1,
5772                                NULL_RTX, true, OPTAB_DIRECT);
5773           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5774         }
5775
5776       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5777       HOST_WIDE_INT base_offset;
5778       if (GET_MODE_SIZE (mode) > 16)
5779         base_offset = (offset + 0x400) & ~0x7f0;
5780       /* For offsets aren't a multiple of the access size, the limit is
5781          -256...255.  */
5782       else if (offset & (GET_MODE_SIZE (mode) - 1))
5783         {
5784           base_offset = (offset + 0x100) & ~0x1ff;
5785
5786           /* BLKmode typically uses LDP of X-registers.  */
5787           if (mode == BLKmode)
5788             base_offset = (offset + 512) & ~0x3ff;
5789         }
5790       /* Small negative offsets are supported.  */
5791       else if (IN_RANGE (offset, -256, 0))
5792         base_offset = 0;
5793       else if (mode == TImode || mode == TFmode)
5794         base_offset = (offset + 0x100) & ~0x1ff;
5795       /* Use 12-bit offset by access size.  */
5796       else
5797         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5798
5799       if (base_offset != 0)
5800         {
5801           base = plus_constant (Pmode, base, base_offset);
5802           base = force_operand (base, NULL_RTX);
5803           return plus_constant (Pmode, base, offset - base_offset);
5804         }
5805     }
5806
5807   return x;
5808 }
5809
5810 /* Return the reload icode required for a constant pool in mode.  */
5811 static enum insn_code
5812 aarch64_constant_pool_reload_icode (machine_mode mode)
5813 {
5814   switch (mode)
5815     {
5816     case E_SFmode:
5817       return CODE_FOR_aarch64_reload_movcpsfdi;
5818
5819     case E_DFmode:
5820       return CODE_FOR_aarch64_reload_movcpdfdi;
5821
5822     case E_TFmode:
5823       return CODE_FOR_aarch64_reload_movcptfdi;
5824
5825     case E_V8QImode:
5826       return CODE_FOR_aarch64_reload_movcpv8qidi;
5827
5828     case E_V16QImode:
5829       return CODE_FOR_aarch64_reload_movcpv16qidi;
5830
5831     case E_V4HImode:
5832       return CODE_FOR_aarch64_reload_movcpv4hidi;
5833
5834     case E_V8HImode:
5835       return CODE_FOR_aarch64_reload_movcpv8hidi;
5836
5837     case E_V2SImode:
5838       return CODE_FOR_aarch64_reload_movcpv2sidi;
5839
5840     case E_V4SImode:
5841       return CODE_FOR_aarch64_reload_movcpv4sidi;
5842
5843     case E_V2DImode:
5844       return CODE_FOR_aarch64_reload_movcpv2didi;
5845
5846     case E_V2DFmode:
5847       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5848
5849     default:
5850       gcc_unreachable ();
5851     }
5852
5853   gcc_unreachable ();
5854 }
5855 static reg_class_t
5856 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5857                           reg_class_t rclass,
5858                           machine_mode mode,
5859                           secondary_reload_info *sri)
5860 {
5861
5862   /* If we have to disable direct literal pool loads and stores because the
5863      function is too big, then we need a scratch register.  */
5864   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5865       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5866           || targetm.vector_mode_supported_p (GET_MODE (x)))
5867       && !aarch64_pcrelative_literal_loads)
5868     {
5869       sri->icode = aarch64_constant_pool_reload_icode (mode);
5870       return NO_REGS;
5871     }
5872
5873   /* Without the TARGET_SIMD instructions we cannot move a Q register
5874      to a Q register directly.  We need a scratch.  */
5875   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5876       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5877       && reg_class_subset_p (rclass, FP_REGS))
5878     {
5879       if (mode == TFmode)
5880         sri->icode = CODE_FOR_aarch64_reload_movtf;
5881       else if (mode == TImode)
5882         sri->icode = CODE_FOR_aarch64_reload_movti;
5883       return NO_REGS;
5884     }
5885
5886   /* A TFmode or TImode memory access should be handled via an FP_REGS
5887      because AArch64 has richer addressing modes for LDR/STR instructions
5888      than LDP/STP instructions.  */
5889   if (TARGET_FLOAT && rclass == GENERAL_REGS
5890       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5891     return FP_REGS;
5892
5893   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5894       return GENERAL_REGS;
5895
5896   return NO_REGS;
5897 }
5898
5899 static bool
5900 aarch64_can_eliminate (const int from, const int to)
5901 {
5902   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5903      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5904
5905   if (frame_pointer_needed)
5906     {
5907       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5908         return true;
5909       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5910         return false;
5911       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5912           && !cfun->calls_alloca)
5913         return true;
5914       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5915         return true;
5916
5917       return false;
5918     }
5919   else
5920     {
5921       /* If we decided that we didn't need a leaf frame pointer but then used
5922          LR in the function, then we'll want a frame pointer after all, so
5923          prevent this elimination to ensure a frame pointer is used.  */
5924       if (to == STACK_POINTER_REGNUM
5925           && flag_omit_leaf_frame_pointer
5926           && df_regs_ever_live_p (LR_REGNUM))
5927         return false;
5928     }
5929
5930   return true;
5931 }
5932
5933 HOST_WIDE_INT
5934 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5935 {
5936   aarch64_layout_frame ();
5937
5938   if (to == HARD_FRAME_POINTER_REGNUM)
5939     {
5940       if (from == ARG_POINTER_REGNUM)
5941         return cfun->machine->frame.hard_fp_offset;
5942
5943       if (from == FRAME_POINTER_REGNUM)
5944         return cfun->machine->frame.hard_fp_offset
5945                - cfun->machine->frame.locals_offset;
5946     }
5947
5948   if (to == STACK_POINTER_REGNUM)
5949     {
5950       if (from == FRAME_POINTER_REGNUM)
5951           return cfun->machine->frame.frame_size
5952                  - cfun->machine->frame.locals_offset;
5953     }
5954
5955   return cfun->machine->frame.frame_size;
5956 }
5957
5958 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5959    previous frame.  */
5960
5961 rtx
5962 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5963 {
5964   if (count != 0)
5965     return const0_rtx;
5966   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5967 }
5968
5969
5970 static void
5971 aarch64_asm_trampoline_template (FILE *f)
5972 {
5973   if (TARGET_ILP32)
5974     {
5975       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5976       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5977     }
5978   else
5979     {
5980       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5981       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5982     }
5983   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5984   assemble_aligned_integer (4, const0_rtx);
5985   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5986   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5987 }
5988
5989 static void
5990 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5991 {
5992   rtx fnaddr, mem, a_tramp;
5993   const int tramp_code_sz = 16;
5994
5995   /* Don't need to copy the trailing D-words, we fill those in below.  */
5996   emit_block_move (m_tramp, assemble_trampoline_template (),
5997                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5998   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5999   fnaddr = XEXP (DECL_RTL (fndecl), 0);
6000   if (GET_MODE (fnaddr) != ptr_mode)
6001     fnaddr = convert_memory_address (ptr_mode, fnaddr);
6002   emit_move_insn (mem, fnaddr);
6003
6004   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
6005   emit_move_insn (mem, chain_value);
6006
6007   /* XXX We should really define a "clear_cache" pattern and use
6008      gen_clear_cache().  */
6009   a_tramp = XEXP (m_tramp, 0);
6010   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
6011                      LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
6012                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
6013                      ptr_mode);
6014 }
6015
6016 static unsigned char
6017 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
6018 {
6019   switch (regclass)
6020     {
6021     case CALLER_SAVE_REGS:
6022     case POINTER_REGS:
6023     case GENERAL_REGS:
6024     case ALL_REGS:
6025     case POINTER_AND_FP_REGS:
6026     case FP_REGS:
6027     case FP_LO_REGS:
6028       return
6029         aarch64_vector_mode_p (mode)
6030           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
6031           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6032     case STACK_REG:
6033       return 1;
6034
6035     case NO_REGS:
6036       return 0;
6037
6038     default:
6039       break;
6040     }
6041   gcc_unreachable ();
6042 }
6043
6044 static reg_class_t
6045 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6046 {
6047   if (regclass == POINTER_REGS)
6048     return GENERAL_REGS;
6049
6050   if (regclass == STACK_REG)
6051     {
6052       if (REG_P(x)
6053           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6054           return regclass;
6055
6056       return NO_REGS;
6057     }
6058
6059   /* Register eliminiation can result in a request for
6060      SP+constant->FP_REGS.  We cannot support such operations which
6061      use SP as source and an FP_REG as destination, so reject out
6062      right now.  */
6063   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6064     {
6065       rtx lhs = XEXP (x, 0);
6066
6067       /* Look through a possible SUBREG introduced by ILP32.  */
6068       if (GET_CODE (lhs) == SUBREG)
6069         lhs = SUBREG_REG (lhs);
6070
6071       gcc_assert (REG_P (lhs));
6072       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6073                                       POINTER_REGS));
6074       return NO_REGS;
6075     }
6076
6077   return regclass;
6078 }
6079
6080 void
6081 aarch64_asm_output_labelref (FILE* f, const char *name)
6082 {
6083   asm_fprintf (f, "%U%s", name);
6084 }
6085
6086 static void
6087 aarch64_elf_asm_constructor (rtx symbol, int priority)
6088 {
6089   if (priority == DEFAULT_INIT_PRIORITY)
6090     default_ctor_section_asm_out_constructor (symbol, priority);
6091   else
6092     {
6093       section *s;
6094       /* While priority is known to be in range [0, 65535], so 18 bytes
6095          would be enough, the compiler might not know that.  To avoid
6096          -Wformat-truncation false positive, use a larger size.  */
6097       char buf[23];
6098       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6099       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6100       switch_to_section (s);
6101       assemble_align (POINTER_SIZE);
6102       assemble_aligned_integer (POINTER_BYTES, symbol);
6103     }
6104 }
6105
6106 static void
6107 aarch64_elf_asm_destructor (rtx symbol, int priority)
6108 {
6109   if (priority == DEFAULT_INIT_PRIORITY)
6110     default_dtor_section_asm_out_destructor (symbol, priority);
6111   else
6112     {
6113       section *s;
6114       /* While priority is known to be in range [0, 65535], so 18 bytes
6115          would be enough, the compiler might not know that.  To avoid
6116          -Wformat-truncation false positive, use a larger size.  */
6117       char buf[23];
6118       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6119       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
6120       switch_to_section (s);
6121       assemble_align (POINTER_SIZE);
6122       assemble_aligned_integer (POINTER_BYTES, symbol);
6123     }
6124 }
6125
6126 const char*
6127 aarch64_output_casesi (rtx *operands)
6128 {
6129   char buf[100];
6130   char label[100];
6131   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6132   int index;
6133   static const char *const patterns[4][2] =
6134   {
6135     {
6136       "ldrb\t%w3, [%0,%w1,uxtw]",
6137       "add\t%3, %4, %w3, sxtb #2"
6138     },
6139     {
6140       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6141       "add\t%3, %4, %w3, sxth #2"
6142     },
6143     {
6144       "ldr\t%w3, [%0,%w1,uxtw #2]",
6145       "add\t%3, %4, %w3, sxtw #2"
6146     },
6147     /* We assume that DImode is only generated when not optimizing and
6148        that we don't really need 64-bit address offsets.  That would
6149        imply an object file with 8GB of code in a single function!  */
6150     {
6151       "ldr\t%w3, [%0,%w1,uxtw #2]",
6152       "add\t%3, %4, %w3, sxtw #2"
6153     }
6154   };
6155
6156   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6157
6158   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
6159   index = exact_log2 (GET_MODE_SIZE (mode));
6160
6161   gcc_assert (index >= 0 && index <= 3);
6162
6163   /* Need to implement table size reduction, by chaning the code below.  */
6164   output_asm_insn (patterns[index][0], operands);
6165   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6166   snprintf (buf, sizeof (buf),
6167             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6168   output_asm_insn (buf, operands);
6169   output_asm_insn (patterns[index][1], operands);
6170   output_asm_insn ("br\t%3", operands);
6171   assemble_label (asm_out_file, label);
6172   return "";
6173 }
6174
6175
6176 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6177    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6178    operator.  */
6179
6180 int
6181 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6182 {
6183   if (shift >= 0 && shift <= 3)
6184     {
6185       int size;
6186       for (size = 8; size <= 32; size *= 2)
6187         {
6188           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6189           if (mask == bits << shift)
6190             return size;
6191         }
6192     }
6193   return 0;
6194 }
6195
6196 /* Constant pools are per function only when PC relative
6197    literal loads are true or we are in the large memory
6198    model.  */
6199
6200 static inline bool
6201 aarch64_can_use_per_function_literal_pools_p (void)
6202 {
6203   return (aarch64_pcrelative_literal_loads
6204           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6205 }
6206
6207 static bool
6208 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6209 {
6210   /* Fixme:: In an ideal world this would work similar
6211      to the logic in aarch64_select_rtx_section but this
6212      breaks bootstrap in gcc go.  For now we workaround
6213      this by returning false here.  */
6214   return false;
6215 }
6216
6217 /* Select appropriate section for constants depending
6218    on where we place literal pools.  */
6219
6220 static section *
6221 aarch64_select_rtx_section (machine_mode mode,
6222                             rtx x,
6223                             unsigned HOST_WIDE_INT align)
6224 {
6225   if (aarch64_can_use_per_function_literal_pools_p ())
6226     return function_section (current_function_decl);
6227
6228   return default_elf_select_rtx_section (mode, x, align);
6229 }
6230
6231 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6232 void
6233 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6234                                   HOST_WIDE_INT offset)
6235 {
6236   /* When using per-function literal pools, we must ensure that any code
6237      section is aligned to the minimal instruction length, lest we get
6238      errors from the assembler re "unaligned instructions".  */
6239   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6240     ASM_OUTPUT_ALIGN (f, 2);
6241 }
6242
6243 /* Costs.  */
6244
6245 /* Helper function for rtx cost calculation.  Strip a shift expression
6246    from X.  Returns the inner operand if successful, or the original
6247    expression on failure.  */
6248 static rtx
6249 aarch64_strip_shift (rtx x)
6250 {
6251   rtx op = x;
6252
6253   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6254      we can convert both to ROR during final output.  */
6255   if ((GET_CODE (op) == ASHIFT
6256        || GET_CODE (op) == ASHIFTRT
6257        || GET_CODE (op) == LSHIFTRT
6258        || GET_CODE (op) == ROTATERT
6259        || GET_CODE (op) == ROTATE)
6260       && CONST_INT_P (XEXP (op, 1)))
6261     return XEXP (op, 0);
6262
6263   if (GET_CODE (op) == MULT
6264       && CONST_INT_P (XEXP (op, 1))
6265       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6266     return XEXP (op, 0);
6267
6268   return x;
6269 }
6270
6271 /* Helper function for rtx cost calculation.  Strip an extend
6272    expression from X.  Returns the inner operand if successful, or the
6273    original expression on failure.  We deal with a number of possible
6274    canonicalization variations here. If STRIP_SHIFT is true, then
6275    we can strip off a shift also.  */
6276 static rtx
6277 aarch64_strip_extend (rtx x, bool strip_shift)
6278 {
6279   scalar_int_mode mode;
6280   rtx op = x;
6281
6282   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
6283     return op;
6284
6285   /* Zero and sign extraction of a widened value.  */
6286   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6287       && XEXP (op, 2) == const0_rtx
6288       && GET_CODE (XEXP (op, 0)) == MULT
6289       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
6290                                          XEXP (op, 1)))
6291     return XEXP (XEXP (op, 0), 0);
6292
6293   /* It can also be represented (for zero-extend) as an AND with an
6294      immediate.  */
6295   if (GET_CODE (op) == AND
6296       && GET_CODE (XEXP (op, 0)) == MULT
6297       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6298       && CONST_INT_P (XEXP (op, 1))
6299       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6300                            INTVAL (XEXP (op, 1))) != 0)
6301     return XEXP (XEXP (op, 0), 0);
6302
6303   /* Now handle extended register, as this may also have an optional
6304      left shift by 1..4.  */
6305   if (strip_shift
6306       && GET_CODE (op) == ASHIFT
6307       && CONST_INT_P (XEXP (op, 1))
6308       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6309     op = XEXP (op, 0);
6310
6311   if (GET_CODE (op) == ZERO_EXTEND
6312       || GET_CODE (op) == SIGN_EXTEND)
6313     op = XEXP (op, 0);
6314
6315   if (op != x)
6316     return op;
6317
6318   return x;
6319 }
6320
6321 /* Return true iff CODE is a shift supported in combination
6322    with arithmetic instructions.  */
6323
6324 static bool
6325 aarch64_shift_p (enum rtx_code code)
6326 {
6327   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6328 }
6329
6330
6331 /* Return true iff X is a cheap shift without a sign extend. */
6332
6333 static bool
6334 aarch64_cheap_mult_shift_p (rtx x)
6335 {
6336   rtx op0, op1;
6337
6338   op0 = XEXP (x, 0);
6339   op1 = XEXP (x, 1);
6340
6341   if (!(aarch64_tune_params.extra_tuning_flags
6342                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6343     return false;
6344
6345   if (GET_CODE (op0) == SIGN_EXTEND)
6346     return false;
6347
6348   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6349       && UINTVAL (op1) <= 4)
6350     return true;
6351
6352   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6353     return false;
6354
6355   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6356
6357   if (l2 > 0 && l2 <= 4)
6358     return true;
6359
6360   return false;
6361 }
6362
6363 /* Helper function for rtx cost calculation.  Calculate the cost of
6364    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6365    Return the calculated cost of the expression, recursing manually in to
6366    operands where needed.  */
6367
6368 static int
6369 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6370 {
6371   rtx op0, op1;
6372   const struct cpu_cost_table *extra_cost
6373     = aarch64_tune_params.insn_extra_cost;
6374   int cost = 0;
6375   bool compound_p = (outer == PLUS || outer == MINUS);
6376   machine_mode mode = GET_MODE (x);
6377
6378   gcc_checking_assert (code == MULT);
6379
6380   op0 = XEXP (x, 0);
6381   op1 = XEXP (x, 1);
6382
6383   if (VECTOR_MODE_P (mode))
6384     mode = GET_MODE_INNER (mode);
6385
6386   /* Integer multiply/fma.  */
6387   if (GET_MODE_CLASS (mode) == MODE_INT)
6388     {
6389       /* The multiply will be canonicalized as a shift, cost it as such.  */
6390       if (aarch64_shift_p (GET_CODE (x))
6391           || (CONST_INT_P (op1)
6392               && exact_log2 (INTVAL (op1)) > 0))
6393         {
6394           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6395                            || GET_CODE (op0) == SIGN_EXTEND;
6396           if (speed)
6397             {
6398               if (compound_p)
6399                 {
6400                   /* If the shift is considered cheap,
6401                      then don't add any cost. */
6402                   if (aarch64_cheap_mult_shift_p (x))
6403                     ;
6404                   else if (REG_P (op1))
6405                     /* ARITH + shift-by-register.  */
6406                     cost += extra_cost->alu.arith_shift_reg;
6407                   else if (is_extend)
6408                     /* ARITH + extended register.  We don't have a cost field
6409                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6410                     cost += extra_cost->alu.extend_arith;
6411                   else
6412                     /* ARITH + shift-by-immediate.  */
6413                     cost += extra_cost->alu.arith_shift;
6414                 }
6415               else
6416                 /* LSL (immediate).  */
6417                 cost += extra_cost->alu.shift;
6418
6419             }
6420           /* Strip extends as we will have costed them in the case above.  */
6421           if (is_extend)
6422             op0 = aarch64_strip_extend (op0, true);
6423
6424           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6425
6426           return cost;
6427         }
6428
6429       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6430          compound and let the below cases handle it.  After all, MNEG is a
6431          special-case alias of MSUB.  */
6432       if (GET_CODE (op0) == NEG)
6433         {
6434           op0 = XEXP (op0, 0);
6435           compound_p = true;
6436         }
6437
6438       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6439       if ((GET_CODE (op0) == ZERO_EXTEND
6440            && GET_CODE (op1) == ZERO_EXTEND)
6441           || (GET_CODE (op0) == SIGN_EXTEND
6442               && GET_CODE (op1) == SIGN_EXTEND))
6443         {
6444           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6445           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6446
6447           if (speed)
6448             {
6449               if (compound_p)
6450                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6451                 cost += extra_cost->mult[0].extend_add;
6452               else
6453                 /* MUL/SMULL/UMULL.  */
6454                 cost += extra_cost->mult[0].extend;
6455             }
6456
6457           return cost;
6458         }
6459
6460       /* This is either an integer multiply or a MADD.  In both cases
6461          we want to recurse and cost the operands.  */
6462       cost += rtx_cost (op0, mode, MULT, 0, speed);
6463       cost += rtx_cost (op1, mode, MULT, 1, speed);
6464
6465       if (speed)
6466         {
6467           if (compound_p)
6468             /* MADD/MSUB.  */
6469             cost += extra_cost->mult[mode == DImode].add;
6470           else
6471             /* MUL.  */
6472             cost += extra_cost->mult[mode == DImode].simple;
6473         }
6474
6475       return cost;
6476     }
6477   else
6478     {
6479       if (speed)
6480         {
6481           /* Floating-point FMA/FMUL can also support negations of the
6482              operands, unless the rounding mode is upward or downward in
6483              which case FNMUL is different than FMUL with operand negation.  */
6484           bool neg0 = GET_CODE (op0) == NEG;
6485           bool neg1 = GET_CODE (op1) == NEG;
6486           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6487             {
6488               if (neg0)
6489                 op0 = XEXP (op0, 0);
6490               if (neg1)
6491                 op1 = XEXP (op1, 0);
6492             }
6493
6494           if (compound_p)
6495             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6496             cost += extra_cost->fp[mode == DFmode].fma;
6497           else
6498             /* FMUL/FNMUL.  */
6499             cost += extra_cost->fp[mode == DFmode].mult;
6500         }
6501
6502       cost += rtx_cost (op0, mode, MULT, 0, speed);
6503       cost += rtx_cost (op1, mode, MULT, 1, speed);
6504       return cost;
6505     }
6506 }
6507
6508 static int
6509 aarch64_address_cost (rtx x,
6510                       machine_mode mode,
6511                       addr_space_t as ATTRIBUTE_UNUSED,
6512                       bool speed)
6513 {
6514   enum rtx_code c = GET_CODE (x);
6515   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6516   struct aarch64_address_info info;
6517   int cost = 0;
6518   info.shift = 0;
6519
6520   if (!aarch64_classify_address (&info, x, mode, c, false))
6521     {
6522       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6523         {
6524           /* This is a CONST or SYMBOL ref which will be split
6525              in a different way depending on the code model in use.
6526              Cost it through the generic infrastructure.  */
6527           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6528           /* Divide through by the cost of one instruction to
6529              bring it to the same units as the address costs.  */
6530           cost_symbol_ref /= COSTS_N_INSNS (1);
6531           /* The cost is then the cost of preparing the address,
6532              followed by an immediate (possibly 0) offset.  */
6533           return cost_symbol_ref + addr_cost->imm_offset;
6534         }
6535       else
6536         {
6537           /* This is most likely a jump table from a case
6538              statement.  */
6539           return addr_cost->register_offset;
6540         }
6541     }
6542
6543   switch (info.type)
6544     {
6545       case ADDRESS_LO_SUM:
6546       case ADDRESS_SYMBOLIC:
6547       case ADDRESS_REG_IMM:
6548         cost += addr_cost->imm_offset;
6549         break;
6550
6551       case ADDRESS_REG_WB:
6552         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6553           cost += addr_cost->pre_modify;
6554         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6555           cost += addr_cost->post_modify;
6556         else
6557           gcc_unreachable ();
6558
6559         break;
6560
6561       case ADDRESS_REG_REG:
6562         cost += addr_cost->register_offset;
6563         break;
6564
6565       case ADDRESS_REG_SXTW:
6566         cost += addr_cost->register_sextend;
6567         break;
6568
6569       case ADDRESS_REG_UXTW:
6570         cost += addr_cost->register_zextend;
6571         break;
6572
6573       default:
6574         gcc_unreachable ();
6575     }
6576
6577
6578   if (info.shift > 0)
6579     {
6580       /* For the sake of calculating the cost of the shifted register
6581          component, we can treat same sized modes in the same way.  */
6582       switch (GET_MODE_BITSIZE (mode))
6583         {
6584           case 16:
6585             cost += addr_cost->addr_scale_costs.hi;
6586             break;
6587
6588           case 32:
6589             cost += addr_cost->addr_scale_costs.si;
6590             break;
6591
6592           case 64:
6593             cost += addr_cost->addr_scale_costs.di;
6594             break;
6595
6596           /* We can't tell, or this is a 128-bit vector.  */
6597           default:
6598             cost += addr_cost->addr_scale_costs.ti;
6599             break;
6600         }
6601     }
6602
6603   return cost;
6604 }
6605
6606 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6607    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6608    to be taken.  */
6609
6610 int
6611 aarch64_branch_cost (bool speed_p, bool predictable_p)
6612 {
6613   /* When optimizing for speed, use the cost of unpredictable branches.  */
6614   const struct cpu_branch_cost *branch_costs =
6615     aarch64_tune_params.branch_costs;
6616
6617   if (!speed_p || predictable_p)
6618     return branch_costs->predictable;
6619   else
6620     return branch_costs->unpredictable;
6621 }
6622
6623 /* Return true if the RTX X in mode MODE is a zero or sign extract
6624    usable in an ADD or SUB (extended register) instruction.  */
6625 static bool
6626 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
6627 {
6628   /* Catch add with a sign extract.
6629      This is add_<optab><mode>_multp2.  */
6630   if (GET_CODE (x) == SIGN_EXTRACT
6631       || GET_CODE (x) == ZERO_EXTRACT)
6632     {
6633       rtx op0 = XEXP (x, 0);
6634       rtx op1 = XEXP (x, 1);
6635       rtx op2 = XEXP (x, 2);
6636
6637       if (GET_CODE (op0) == MULT
6638           && CONST_INT_P (op1)
6639           && op2 == const0_rtx
6640           && CONST_INT_P (XEXP (op0, 1))
6641           && aarch64_is_extend_from_extract (mode,
6642                                              XEXP (op0, 1),
6643                                              op1))
6644         {
6645           return true;
6646         }
6647     }
6648   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6649      No shift.  */
6650   else if (GET_CODE (x) == SIGN_EXTEND
6651            || GET_CODE (x) == ZERO_EXTEND)
6652     return REG_P (XEXP (x, 0));
6653
6654   return false;
6655 }
6656
6657 static bool
6658 aarch64_frint_unspec_p (unsigned int u)
6659 {
6660   switch (u)
6661     {
6662       case UNSPEC_FRINTZ:
6663       case UNSPEC_FRINTP:
6664       case UNSPEC_FRINTM:
6665       case UNSPEC_FRINTA:
6666       case UNSPEC_FRINTN:
6667       case UNSPEC_FRINTX:
6668       case UNSPEC_FRINTI:
6669         return true;
6670
6671       default:
6672         return false;
6673     }
6674 }
6675
6676 /* Return true iff X is an rtx that will match an extr instruction
6677    i.e. as described in the *extr<mode>5_insn family of patterns.
6678    OP0 and OP1 will be set to the operands of the shifts involved
6679    on success and will be NULL_RTX otherwise.  */
6680
6681 static bool
6682 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6683 {
6684   rtx op0, op1;
6685   scalar_int_mode mode;
6686   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
6687     return false;
6688
6689   *res_op0 = NULL_RTX;
6690   *res_op1 = NULL_RTX;
6691
6692   if (GET_CODE (x) != IOR)
6693     return false;
6694
6695   op0 = XEXP (x, 0);
6696   op1 = XEXP (x, 1);
6697
6698   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6699       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6700     {
6701      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6702       if (GET_CODE (op1) == ASHIFT)
6703         std::swap (op0, op1);
6704
6705       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6706         return false;
6707
6708       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6709       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6710
6711       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6712           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6713         {
6714           *res_op0 = XEXP (op0, 0);
6715           *res_op1 = XEXP (op1, 0);
6716           return true;
6717         }
6718     }
6719
6720   return false;
6721 }
6722
6723 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6724    storing it in *COST.  Result is true if the total cost of the operation
6725    has now been calculated.  */
6726 static bool
6727 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6728 {
6729   rtx inner;
6730   rtx comparator;
6731   enum rtx_code cmpcode;
6732
6733   if (COMPARISON_P (op0))
6734     {
6735       inner = XEXP (op0, 0);
6736       comparator = XEXP (op0, 1);
6737       cmpcode = GET_CODE (op0);
6738     }
6739   else
6740     {
6741       inner = op0;
6742       comparator = const0_rtx;
6743       cmpcode = NE;
6744     }
6745
6746   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6747     {
6748       /* Conditional branch.  */
6749       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6750         return true;
6751       else
6752         {
6753           if (cmpcode == NE || cmpcode == EQ)
6754             {
6755               if (comparator == const0_rtx)
6756                 {
6757                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6758                   if (GET_CODE (inner) == ZERO_EXTRACT)
6759                     /* TBZ/TBNZ.  */
6760                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6761                                        ZERO_EXTRACT, 0, speed);
6762                   else
6763                     /* CBZ/CBNZ.  */
6764                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6765
6766                 return true;
6767               }
6768             }
6769           else if (cmpcode == LT || cmpcode == GE)
6770             {
6771               /* TBZ/TBNZ.  */
6772               if (comparator == const0_rtx)
6773                 return true;
6774             }
6775         }
6776     }
6777   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6778     {
6779       /* CCMP.  */
6780       if (GET_CODE (op1) == COMPARE)
6781         {
6782           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6783           if (XEXP (op1, 1) == const0_rtx)
6784             *cost += 1;
6785           if (speed)
6786             {
6787               machine_mode mode = GET_MODE (XEXP (op1, 0));
6788               const struct cpu_cost_table *extra_cost
6789                 = aarch64_tune_params.insn_extra_cost;
6790
6791               if (GET_MODE_CLASS (mode) == MODE_INT)
6792                 *cost += extra_cost->alu.arith;
6793               else
6794                 *cost += extra_cost->fp[mode == DFmode].compare;
6795             }
6796           return true;
6797         }
6798
6799       /* It's a conditional operation based on the status flags,
6800          so it must be some flavor of CSEL.  */
6801
6802       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6803       if (GET_CODE (op1) == NEG
6804           || GET_CODE (op1) == NOT
6805           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6806         op1 = XEXP (op1, 0);
6807       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6808         {
6809           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6810           op1 = XEXP (op1, 0);
6811           op2 = XEXP (op2, 0);
6812         }
6813
6814       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6815       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6816       return true;
6817     }
6818
6819   /* We don't know what this is, cost all operands.  */
6820   return false;
6821 }
6822
6823 /* Check whether X is a bitfield operation of the form shift + extend that
6824    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6825    operand to which the bitfield operation is applied.  Otherwise return
6826    NULL_RTX.  */
6827
6828 static rtx
6829 aarch64_extend_bitfield_pattern_p (rtx x)
6830 {
6831   rtx_code outer_code = GET_CODE (x);
6832   machine_mode outer_mode = GET_MODE (x);
6833
6834   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6835       && outer_mode != SImode && outer_mode != DImode)
6836     return NULL_RTX;
6837
6838   rtx inner = XEXP (x, 0);
6839   rtx_code inner_code = GET_CODE (inner);
6840   machine_mode inner_mode = GET_MODE (inner);
6841   rtx op = NULL_RTX;
6842
6843   switch (inner_code)
6844     {
6845       case ASHIFT:
6846         if (CONST_INT_P (XEXP (inner, 1))
6847             && (inner_mode == QImode || inner_mode == HImode))
6848           op = XEXP (inner, 0);
6849         break;
6850       case LSHIFTRT:
6851         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6852             && (inner_mode == QImode || inner_mode == HImode))
6853           op = XEXP (inner, 0);
6854         break;
6855       case ASHIFTRT:
6856         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6857             && (inner_mode == QImode || inner_mode == HImode))
6858           op = XEXP (inner, 0);
6859         break;
6860       default:
6861         break;
6862     }
6863
6864   return op;
6865 }
6866
6867 /* Return true if the mask and a shift amount from an RTX of the form
6868    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6869    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6870
6871 bool
6872 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
6873                                     rtx shft_amnt)
6874 {
6875   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6876          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6877          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6878          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6879 }
6880
6881 /* Calculate the cost of calculating X, storing it in *COST.  Result
6882    is true if the total cost of the operation has now been calculated.  */
6883 static bool
6884 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6885                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6886 {
6887   rtx op0, op1, op2;
6888   const struct cpu_cost_table *extra_cost
6889     = aarch64_tune_params.insn_extra_cost;
6890   int code = GET_CODE (x);
6891   scalar_int_mode int_mode;
6892
6893   /* By default, assume that everything has equivalent cost to the
6894      cheapest instruction.  Any additional costs are applied as a delta
6895      above this default.  */
6896   *cost = COSTS_N_INSNS (1);
6897
6898   switch (code)
6899     {
6900     case SET:
6901       /* The cost depends entirely on the operands to SET.  */
6902       *cost = 0;
6903       op0 = SET_DEST (x);
6904       op1 = SET_SRC (x);
6905
6906       switch (GET_CODE (op0))
6907         {
6908         case MEM:
6909           if (speed)
6910             {
6911               rtx address = XEXP (op0, 0);
6912               if (VECTOR_MODE_P (mode))
6913                 *cost += extra_cost->ldst.storev;
6914               else if (GET_MODE_CLASS (mode) == MODE_INT)
6915                 *cost += extra_cost->ldst.store;
6916               else if (mode == SFmode)
6917                 *cost += extra_cost->ldst.storef;
6918               else if (mode == DFmode)
6919                 *cost += extra_cost->ldst.stored;
6920
6921               *cost +=
6922                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6923                                                      0, speed));
6924             }
6925
6926           *cost += rtx_cost (op1, mode, SET, 1, speed);
6927           return true;
6928
6929         case SUBREG:
6930           if (! REG_P (SUBREG_REG (op0)))
6931             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6932
6933           /* Fall through.  */
6934         case REG:
6935           /* The cost is one per vector-register copied.  */
6936           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6937             {
6938               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6939                               / GET_MODE_SIZE (V4SImode);
6940               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6941             }
6942           /* const0_rtx is in general free, but we will use an
6943              instruction to set a register to 0.  */
6944           else if (REG_P (op1) || op1 == const0_rtx)
6945             {
6946               /* The cost is 1 per register copied.  */
6947               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6948                               / UNITS_PER_WORD;
6949               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6950             }
6951           else
6952             /* Cost is just the cost of the RHS of the set.  */
6953             *cost += rtx_cost (op1, mode, SET, 1, speed);
6954           return true;
6955
6956         case ZERO_EXTRACT:
6957         case SIGN_EXTRACT:
6958           /* Bit-field insertion.  Strip any redundant widening of
6959              the RHS to meet the width of the target.  */
6960           if (GET_CODE (op1) == SUBREG)
6961             op1 = SUBREG_REG (op1);
6962           if ((GET_CODE (op1) == ZERO_EXTEND
6963                || GET_CODE (op1) == SIGN_EXTEND)
6964               && CONST_INT_P (XEXP (op0, 1))
6965               && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
6966               && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
6967             op1 = XEXP (op1, 0);
6968
6969           if (CONST_INT_P (op1))
6970             {
6971               /* MOV immediate is assumed to always be cheap.  */
6972               *cost = COSTS_N_INSNS (1);
6973             }
6974           else
6975             {
6976               /* BFM.  */
6977               if (speed)
6978                 *cost += extra_cost->alu.bfi;
6979               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6980             }
6981
6982           return true;
6983
6984         default:
6985           /* We can't make sense of this, assume default cost.  */
6986           *cost = COSTS_N_INSNS (1);
6987           return false;
6988         }
6989       return false;
6990
6991     case CONST_INT:
6992       /* If an instruction can incorporate a constant within the
6993          instruction, the instruction's expression avoids calling
6994          rtx_cost() on the constant.  If rtx_cost() is called on a
6995          constant, then it is usually because the constant must be
6996          moved into a register by one or more instructions.
6997
6998          The exception is constant 0, which can be expressed
6999          as XZR/WZR and is therefore free.  The exception to this is
7000          if we have (set (reg) (const0_rtx)) in which case we must cost
7001          the move.  However, we can catch that when we cost the SET, so
7002          we don't need to consider that here.  */
7003       if (x == const0_rtx)
7004         *cost = 0;
7005       else
7006         {
7007           /* To an approximation, building any other constant is
7008              proportionally expensive to the number of instructions
7009              required to build that constant.  This is true whether we
7010              are compiling for SPEED or otherwise.  */
7011           if (!is_a <scalar_int_mode> (mode, &int_mode))
7012             int_mode = word_mode;
7013           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
7014                                  (NULL_RTX, x, false, int_mode));
7015         }
7016       return true;
7017
7018     case CONST_DOUBLE:
7019
7020       /* First determine number of instructions to do the move
7021           as an integer constant.  */
7022       if (!aarch64_float_const_representable_p (x)
7023            && !aarch64_can_const_movi_rtx_p (x, mode)
7024            && aarch64_float_const_rtx_p (x))
7025         {
7026           unsigned HOST_WIDE_INT ival;
7027           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
7028           gcc_assert (succeed);
7029
7030           scalar_int_mode imode = (mode == HFmode
7031                                    ? SImode
7032                                    : int_mode_for_mode (mode).require ());
7033           int ncost = aarch64_internal_mov_immediate
7034                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7035           *cost += COSTS_N_INSNS (ncost);
7036           return true;
7037         }
7038
7039       if (speed)
7040         {
7041           /* mov[df,sf]_aarch64.  */
7042           if (aarch64_float_const_representable_p (x))
7043             /* FMOV (scalar immediate).  */
7044             *cost += extra_cost->fp[mode == DFmode].fpconst;
7045           else if (!aarch64_float_const_zero_rtx_p (x))
7046             {
7047               /* This will be a load from memory.  */
7048               if (mode == DFmode)
7049                 *cost += extra_cost->ldst.loadd;
7050               else
7051                 *cost += extra_cost->ldst.loadf;
7052             }
7053           else
7054             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7055                or MOV v0.s[0], wzr - neither of which are modeled by the
7056                cost tables.  Just use the default cost.  */
7057             {
7058             }
7059         }
7060
7061       return true;
7062
7063     case MEM:
7064       if (speed)
7065         {
7066           /* For loads we want the base cost of a load, plus an
7067              approximation for the additional cost of the addressing
7068              mode.  */
7069           rtx address = XEXP (x, 0);
7070           if (VECTOR_MODE_P (mode))
7071             *cost += extra_cost->ldst.loadv;
7072           else if (GET_MODE_CLASS (mode) == MODE_INT)
7073             *cost += extra_cost->ldst.load;
7074           else if (mode == SFmode)
7075             *cost += extra_cost->ldst.loadf;
7076           else if (mode == DFmode)
7077             *cost += extra_cost->ldst.loadd;
7078
7079           *cost +=
7080                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7081                                                      0, speed));
7082         }
7083
7084       return true;
7085
7086     case NEG:
7087       op0 = XEXP (x, 0);
7088
7089       if (VECTOR_MODE_P (mode))
7090         {
7091           if (speed)
7092             {
7093               /* FNEG.  */
7094               *cost += extra_cost->vect.alu;
7095             }
7096           return false;
7097         }
7098
7099       if (GET_MODE_CLASS (mode) == MODE_INT)
7100         {
7101           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7102               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7103             {
7104               /* CSETM.  */
7105               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7106               return true;
7107             }
7108
7109           /* Cost this as SUB wzr, X.  */
7110           op0 = CONST0_RTX (mode);
7111           op1 = XEXP (x, 0);
7112           goto cost_minus;
7113         }
7114
7115       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7116         {
7117           /* Support (neg(fma...)) as a single instruction only if
7118              sign of zeros is unimportant.  This matches the decision
7119              making in aarch64.md.  */
7120           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7121             {
7122               /* FNMADD.  */
7123               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7124               return true;
7125             }
7126           if (GET_CODE (op0) == MULT)
7127             {
7128               /* FNMUL.  */
7129               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7130               return true;
7131             }
7132           if (speed)
7133             /* FNEG.  */
7134             *cost += extra_cost->fp[mode == DFmode].neg;
7135           return false;
7136         }
7137
7138       return false;
7139
7140     case CLRSB:
7141     case CLZ:
7142       if (speed)
7143         {
7144           if (VECTOR_MODE_P (mode))
7145             *cost += extra_cost->vect.alu;
7146           else
7147             *cost += extra_cost->alu.clz;
7148         }
7149
7150       return false;
7151
7152     case COMPARE:
7153       op0 = XEXP (x, 0);
7154       op1 = XEXP (x, 1);
7155
7156       if (op1 == const0_rtx
7157           && GET_CODE (op0) == AND)
7158         {
7159           x = op0;
7160           mode = GET_MODE (op0);
7161           goto cost_logic;
7162         }
7163
7164       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7165         {
7166           /* TODO: A write to the CC flags possibly costs extra, this
7167              needs encoding in the cost tables.  */
7168
7169           mode = GET_MODE (op0);
7170           /* ANDS.  */
7171           if (GET_CODE (op0) == AND)
7172             {
7173               x = op0;
7174               goto cost_logic;
7175             }
7176
7177           if (GET_CODE (op0) == PLUS)
7178             {
7179               /* ADDS (and CMN alias).  */
7180               x = op0;
7181               goto cost_plus;
7182             }
7183
7184           if (GET_CODE (op0) == MINUS)
7185             {
7186               /* SUBS.  */
7187               x = op0;
7188               goto cost_minus;
7189             }
7190
7191           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7192               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7193               && CONST_INT_P (XEXP (op0, 2)))
7194             {
7195               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7196                  Handle it here directly rather than going to cost_logic
7197                  since we know the immediate generated for the TST is valid
7198                  so we can avoid creating an intermediate rtx for it only
7199                  for costing purposes.  */
7200               if (speed)
7201                 *cost += extra_cost->alu.logical;
7202
7203               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7204                                  ZERO_EXTRACT, 0, speed);
7205               return true;
7206             }
7207
7208           if (GET_CODE (op1) == NEG)
7209             {
7210               /* CMN.  */
7211               if (speed)
7212                 *cost += extra_cost->alu.arith;
7213
7214               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7215               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7216               return true;
7217             }
7218
7219           /* CMP.
7220
7221              Compare can freely swap the order of operands, and
7222              canonicalization puts the more complex operation first.
7223              But the integer MINUS logic expects the shift/extend
7224              operation in op1.  */
7225           if (! (REG_P (op0)
7226                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7227           {
7228             op0 = XEXP (x, 1);
7229             op1 = XEXP (x, 0);
7230           }
7231           goto cost_minus;
7232         }
7233
7234       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7235         {
7236           /* FCMP.  */
7237           if (speed)
7238             *cost += extra_cost->fp[mode == DFmode].compare;
7239
7240           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7241             {
7242               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7243               /* FCMP supports constant 0.0 for no extra cost. */
7244               return true;
7245             }
7246           return false;
7247         }
7248
7249       if (VECTOR_MODE_P (mode))
7250         {
7251           /* Vector compare.  */
7252           if (speed)
7253             *cost += extra_cost->vect.alu;
7254
7255           if (aarch64_float_const_zero_rtx_p (op1))
7256             {
7257               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7258                  cost.  */
7259               return true;
7260             }
7261           return false;
7262         }
7263       return false;
7264
7265     case MINUS:
7266       {
7267         op0 = XEXP (x, 0);
7268         op1 = XEXP (x, 1);
7269
7270 cost_minus:
7271         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7272
7273         /* Detect valid immediates.  */
7274         if ((GET_MODE_CLASS (mode) == MODE_INT
7275              || (GET_MODE_CLASS (mode) == MODE_CC
7276                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7277             && CONST_INT_P (op1)
7278             && aarch64_uimm12_shift (INTVAL (op1)))
7279           {
7280             if (speed)
7281               /* SUB(S) (immediate).  */
7282               *cost += extra_cost->alu.arith;
7283             return true;
7284           }
7285
7286         /* Look for SUB (extended register).  */
7287         if (is_a <scalar_int_mode> (mode, &int_mode)
7288             && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7289           {
7290             if (speed)
7291               *cost += extra_cost->alu.extend_arith;
7292
7293             op1 = aarch64_strip_extend (op1, true);
7294             *cost += rtx_cost (op1, VOIDmode,
7295                                (enum rtx_code) GET_CODE (op1), 0, speed);
7296             return true;
7297           }
7298
7299         rtx new_op1 = aarch64_strip_extend (op1, false);
7300
7301         /* Cost this as an FMA-alike operation.  */
7302         if ((GET_CODE (new_op1) == MULT
7303              || aarch64_shift_p (GET_CODE (new_op1)))
7304             && code != COMPARE)
7305           {
7306             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7307                                             (enum rtx_code) code,
7308                                             speed);
7309             return true;
7310           }
7311
7312         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7313
7314         if (speed)
7315           {
7316             if (VECTOR_MODE_P (mode))
7317               {
7318                 /* Vector SUB.  */
7319                 *cost += extra_cost->vect.alu;
7320               }
7321             else if (GET_MODE_CLASS (mode) == MODE_INT)
7322               {
7323                 /* SUB(S).  */
7324                 *cost += extra_cost->alu.arith;
7325               }
7326             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7327               {
7328                 /* FSUB.  */
7329                 *cost += extra_cost->fp[mode == DFmode].addsub;
7330               }
7331           }
7332         return true;
7333       }
7334
7335     case PLUS:
7336       {
7337         rtx new_op0;
7338
7339         op0 = XEXP (x, 0);
7340         op1 = XEXP (x, 1);
7341
7342 cost_plus:
7343         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7344             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7345           {
7346             /* CSINC.  */
7347             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7348             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7349             return true;
7350           }
7351
7352         if (GET_MODE_CLASS (mode) == MODE_INT
7353             && CONST_INT_P (op1)
7354             && aarch64_uimm12_shift (INTVAL (op1)))
7355           {
7356             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7357
7358             if (speed)
7359               /* ADD (immediate).  */
7360               *cost += extra_cost->alu.arith;
7361             return true;
7362           }
7363
7364         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7365
7366         /* Look for ADD (extended register).  */
7367         if (is_a <scalar_int_mode> (mode, &int_mode)
7368             && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7369           {
7370             if (speed)
7371               *cost += extra_cost->alu.extend_arith;
7372
7373             op0 = aarch64_strip_extend (op0, true);
7374             *cost += rtx_cost (op0, VOIDmode,
7375                                (enum rtx_code) GET_CODE (op0), 0, speed);
7376             return true;
7377           }
7378
7379         /* Strip any extend, leave shifts behind as we will
7380            cost them through mult_cost.  */
7381         new_op0 = aarch64_strip_extend (op0, false);
7382
7383         if (GET_CODE (new_op0) == MULT
7384             || aarch64_shift_p (GET_CODE (new_op0)))
7385           {
7386             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7387                                             speed);
7388             return true;
7389           }
7390
7391         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7392
7393         if (speed)
7394           {
7395             if (VECTOR_MODE_P (mode))
7396               {
7397                 /* Vector ADD.  */
7398                 *cost += extra_cost->vect.alu;
7399               }
7400             else if (GET_MODE_CLASS (mode) == MODE_INT)
7401               {
7402                 /* ADD.  */
7403                 *cost += extra_cost->alu.arith;
7404               }
7405             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7406               {
7407                 /* FADD.  */
7408                 *cost += extra_cost->fp[mode == DFmode].addsub;
7409               }
7410           }
7411         return true;
7412       }
7413
7414     case BSWAP:
7415       *cost = COSTS_N_INSNS (1);
7416
7417       if (speed)
7418         {
7419           if (VECTOR_MODE_P (mode))
7420             *cost += extra_cost->vect.alu;
7421           else
7422             *cost += extra_cost->alu.rev;
7423         }
7424       return false;
7425
7426     case IOR:
7427       if (aarch_rev16_p (x))
7428         {
7429           *cost = COSTS_N_INSNS (1);
7430
7431           if (speed)
7432             {
7433               if (VECTOR_MODE_P (mode))
7434                 *cost += extra_cost->vect.alu;
7435               else
7436                 *cost += extra_cost->alu.rev;
7437             }
7438           return true;
7439         }
7440
7441       if (aarch64_extr_rtx_p (x, &op0, &op1))
7442         {
7443           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7444           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7445           if (speed)
7446             *cost += extra_cost->alu.shift;
7447
7448           return true;
7449         }
7450     /* Fall through.  */
7451     case XOR:
7452     case AND:
7453     cost_logic:
7454       op0 = XEXP (x, 0);
7455       op1 = XEXP (x, 1);
7456
7457       if (VECTOR_MODE_P (mode))
7458         {
7459           if (speed)
7460             *cost += extra_cost->vect.alu;
7461           return true;
7462         }
7463
7464       if (code == AND
7465           && GET_CODE (op0) == MULT
7466           && CONST_INT_P (XEXP (op0, 1))
7467           && CONST_INT_P (op1)
7468           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7469                                INTVAL (op1)) != 0)
7470         {
7471           /* This is a UBFM/SBFM.  */
7472           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7473           if (speed)
7474             *cost += extra_cost->alu.bfx;
7475           return true;
7476         }
7477
7478       if (is_int_mode (mode, &int_mode))
7479         {
7480           if (CONST_INT_P (op1))
7481             {
7482               /* We have a mask + shift version of a UBFIZ
7483                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7484               if (GET_CODE (op0) == ASHIFT
7485                   && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
7486                                                          XEXP (op0, 1)))
7487                 {
7488                   *cost += rtx_cost (XEXP (op0, 0), int_mode,
7489                                      (enum rtx_code) code, 0, speed);
7490                   if (speed)
7491                     *cost += extra_cost->alu.bfx;
7492
7493                   return true;
7494                 }
7495               else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
7496                 {
7497                 /* We possibly get the immediate for free, this is not
7498                    modelled.  */
7499                   *cost += rtx_cost (op0, int_mode,
7500                                      (enum rtx_code) code, 0, speed);
7501                   if (speed)
7502                     *cost += extra_cost->alu.logical;
7503
7504                   return true;
7505                 }
7506             }
7507           else
7508             {
7509               rtx new_op0 = op0;
7510
7511               /* Handle ORN, EON, or BIC.  */
7512               if (GET_CODE (op0) == NOT)
7513                 op0 = XEXP (op0, 0);
7514
7515               new_op0 = aarch64_strip_shift (op0);
7516
7517               /* If we had a shift on op0 then this is a logical-shift-
7518                  by-register/immediate operation.  Otherwise, this is just
7519                  a logical operation.  */
7520               if (speed)
7521                 {
7522                   if (new_op0 != op0)
7523                     {
7524                       /* Shift by immediate.  */
7525                       if (CONST_INT_P (XEXP (op0, 1)))
7526                         *cost += extra_cost->alu.log_shift;
7527                       else
7528                         *cost += extra_cost->alu.log_shift_reg;
7529                     }
7530                   else
7531                     *cost += extra_cost->alu.logical;
7532                 }
7533
7534               /* In both cases we want to cost both operands.  */
7535               *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
7536                                  0, speed);
7537               *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
7538                                  1, speed);
7539
7540               return true;
7541             }
7542         }
7543       return false;
7544
7545     case NOT:
7546       x = XEXP (x, 0);
7547       op0 = aarch64_strip_shift (x);
7548
7549       if (VECTOR_MODE_P (mode))
7550         {
7551           /* Vector NOT.  */
7552           *cost += extra_cost->vect.alu;
7553           return false;
7554         }
7555
7556       /* MVN-shifted-reg.  */
7557       if (op0 != x)
7558         {
7559           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7560
7561           if (speed)
7562             *cost += extra_cost->alu.log_shift;
7563
7564           return true;
7565         }
7566       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7567          Handle the second form here taking care that 'a' in the above can
7568          be a shift.  */
7569       else if (GET_CODE (op0) == XOR)
7570         {
7571           rtx newop0 = XEXP (op0, 0);
7572           rtx newop1 = XEXP (op0, 1);
7573           rtx op0_stripped = aarch64_strip_shift (newop0);
7574
7575           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7576           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7577
7578           if (speed)
7579             {
7580               if (op0_stripped != newop0)
7581                 *cost += extra_cost->alu.log_shift;
7582               else
7583                 *cost += extra_cost->alu.logical;
7584             }
7585
7586           return true;
7587         }
7588       /* MVN.  */
7589       if (speed)
7590         *cost += extra_cost->alu.logical;
7591
7592       return false;
7593
7594     case ZERO_EXTEND:
7595
7596       op0 = XEXP (x, 0);
7597       /* If a value is written in SI mode, then zero extended to DI
7598          mode, the operation will in general be free as a write to
7599          a 'w' register implicitly zeroes the upper bits of an 'x'
7600          register.  However, if this is
7601
7602            (set (reg) (zero_extend (reg)))
7603
7604          we must cost the explicit register move.  */
7605       if (mode == DImode
7606           && GET_MODE (op0) == SImode
7607           && outer == SET)
7608         {
7609           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7610
7611         /* If OP_COST is non-zero, then the cost of the zero extend
7612            is effectively the cost of the inner operation.  Otherwise
7613            we have a MOV instruction and we take the cost from the MOV
7614            itself.  This is true independently of whether we are
7615            optimizing for space or time.  */
7616           if (op_cost)
7617             *cost = op_cost;
7618
7619           return true;
7620         }
7621       else if (MEM_P (op0))
7622         {
7623           /* All loads can zero extend to any size for free.  */
7624           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7625           return true;
7626         }
7627
7628       op0 = aarch64_extend_bitfield_pattern_p (x);
7629       if (op0)
7630         {
7631           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7632           if (speed)
7633             *cost += extra_cost->alu.bfx;
7634           return true;
7635         }
7636
7637       if (speed)
7638         {
7639           if (VECTOR_MODE_P (mode))
7640             {
7641               /* UMOV.  */
7642               *cost += extra_cost->vect.alu;
7643             }
7644           else
7645             {
7646               /* We generate an AND instead of UXTB/UXTH.  */
7647               *cost += extra_cost->alu.logical;
7648             }
7649         }
7650       return false;
7651
7652     case SIGN_EXTEND:
7653       if (MEM_P (XEXP (x, 0)))
7654         {
7655           /* LDRSH.  */
7656           if (speed)
7657             {
7658               rtx address = XEXP (XEXP (x, 0), 0);
7659               *cost += extra_cost->ldst.load_sign_extend;
7660
7661               *cost +=
7662                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7663                                                      0, speed));
7664             }
7665           return true;
7666         }
7667
7668       op0 = aarch64_extend_bitfield_pattern_p (x);
7669       if (op0)
7670         {
7671           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7672           if (speed)
7673             *cost += extra_cost->alu.bfx;
7674           return true;
7675         }
7676
7677       if (speed)
7678         {
7679           if (VECTOR_MODE_P (mode))
7680             *cost += extra_cost->vect.alu;
7681           else
7682             *cost += extra_cost->alu.extend;
7683         }
7684       return false;
7685
7686     case ASHIFT:
7687       op0 = XEXP (x, 0);
7688       op1 = XEXP (x, 1);
7689
7690       if (CONST_INT_P (op1))
7691         {
7692           if (speed)
7693             {
7694               if (VECTOR_MODE_P (mode))
7695                 {
7696                   /* Vector shift (immediate).  */
7697                   *cost += extra_cost->vect.alu;
7698                 }
7699               else
7700                 {
7701                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7702                      aliases.  */
7703                   *cost += extra_cost->alu.shift;
7704                 }
7705             }
7706
7707           /* We can incorporate zero/sign extend for free.  */
7708           if (GET_CODE (op0) == ZERO_EXTEND
7709               || GET_CODE (op0) == SIGN_EXTEND)
7710             op0 = XEXP (op0, 0);
7711
7712           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7713           return true;
7714         }
7715       else
7716         {
7717           if (VECTOR_MODE_P (mode))
7718             {
7719               if (speed)
7720                 /* Vector shift (register).  */
7721                 *cost += extra_cost->vect.alu;
7722             }
7723           else
7724             {
7725               if (speed)
7726                 /* LSLV.  */
7727                 *cost += extra_cost->alu.shift_reg;
7728
7729               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7730                   && CONST_INT_P (XEXP (op1, 1))
7731                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7732                 {
7733                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7734                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7735                      don't recurse into it.  */
7736                   return true;
7737                 }
7738             }
7739           return false;  /* All arguments need to be in registers.  */
7740         }
7741
7742     case ROTATE:
7743     case ROTATERT:
7744     case LSHIFTRT:
7745     case ASHIFTRT:
7746       op0 = XEXP (x, 0);
7747       op1 = XEXP (x, 1);
7748
7749       if (CONST_INT_P (op1))
7750         {
7751           /* ASR (immediate) and friends.  */
7752           if (speed)
7753             {
7754               if (VECTOR_MODE_P (mode))
7755                 *cost += extra_cost->vect.alu;
7756               else
7757                 *cost += extra_cost->alu.shift;
7758             }
7759
7760           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7761           return true;
7762         }
7763       else
7764         {
7765           if (VECTOR_MODE_P (mode))
7766             {
7767               if (speed)
7768                 /* Vector shift (register).  */
7769                 *cost += extra_cost->vect.alu;
7770             }
7771           else
7772             {
7773               if (speed)
7774                 /* ASR (register) and friends.  */
7775                 *cost += extra_cost->alu.shift_reg;
7776
7777               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7778                   && CONST_INT_P (XEXP (op1, 1))
7779                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7780                 {
7781                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7782                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7783                      don't recurse into it.  */
7784                   return true;
7785                 }
7786             }
7787           return false;  /* All arguments need to be in registers.  */
7788         }
7789
7790     case SYMBOL_REF:
7791
7792       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7793           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7794         {
7795           /* LDR.  */
7796           if (speed)
7797             *cost += extra_cost->ldst.load;
7798         }
7799       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7800                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7801         {
7802           /* ADRP, followed by ADD.  */
7803           *cost += COSTS_N_INSNS (1);
7804           if (speed)
7805             *cost += 2 * extra_cost->alu.arith;
7806         }
7807       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7808                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7809         {
7810           /* ADR.  */
7811           if (speed)
7812             *cost += extra_cost->alu.arith;
7813         }
7814
7815       if (flag_pic)
7816         {
7817           /* One extra load instruction, after accessing the GOT.  */
7818           *cost += COSTS_N_INSNS (1);
7819           if (speed)
7820             *cost += extra_cost->ldst.load;
7821         }
7822       return true;
7823
7824     case HIGH:
7825     case LO_SUM:
7826       /* ADRP/ADD (immediate).  */
7827       if (speed)
7828         *cost += extra_cost->alu.arith;
7829       return true;
7830
7831     case ZERO_EXTRACT:
7832     case SIGN_EXTRACT:
7833       /* UBFX/SBFX.  */
7834       if (speed)
7835         {
7836           if (VECTOR_MODE_P (mode))
7837             *cost += extra_cost->vect.alu;
7838           else
7839             *cost += extra_cost->alu.bfx;
7840         }
7841
7842       /* We can trust that the immediates used will be correct (there
7843          are no by-register forms), so we need only cost op0.  */
7844       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7845       return true;
7846
7847     case MULT:
7848       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7849       /* aarch64_rtx_mult_cost always handles recursion to its
7850          operands.  */
7851       return true;
7852
7853     case MOD:
7854     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7855        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7856        an unconditional negate.  This case should only ever be reached through
7857        the set_smod_pow2_cheap check in expmed.c.  */
7858       if (CONST_INT_P (XEXP (x, 1))
7859           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7860           && (mode == SImode || mode == DImode))
7861         {
7862           /* We expand to 4 instructions.  Reset the baseline.  */
7863           *cost = COSTS_N_INSNS (4);
7864
7865           if (speed)
7866             *cost += 2 * extra_cost->alu.logical
7867                      + 2 * extra_cost->alu.arith;
7868
7869           return true;
7870         }
7871
7872     /* Fall-through.  */
7873     case UMOD:
7874       if (speed)
7875         {
7876           /* Slighly prefer UMOD over SMOD.  */
7877           if (VECTOR_MODE_P (mode))
7878             *cost += extra_cost->vect.alu;
7879           else if (GET_MODE_CLASS (mode) == MODE_INT)
7880             *cost += (extra_cost->mult[mode == DImode].add
7881                       + extra_cost->mult[mode == DImode].idiv
7882                       + (code == MOD ? 1 : 0));
7883         }
7884       return false;  /* All arguments need to be in registers.  */
7885
7886     case DIV:
7887     case UDIV:
7888     case SQRT:
7889       if (speed)
7890         {
7891           if (VECTOR_MODE_P (mode))
7892             *cost += extra_cost->vect.alu;
7893           else if (GET_MODE_CLASS (mode) == MODE_INT)
7894             /* There is no integer SQRT, so only DIV and UDIV can get
7895                here.  */
7896             *cost += (extra_cost->mult[mode == DImode].idiv
7897                      /* Slighly prefer UDIV over SDIV.  */
7898                      + (code == DIV ? 1 : 0));
7899           else
7900             *cost += extra_cost->fp[mode == DFmode].div;
7901         }
7902       return false;  /* All arguments need to be in registers.  */
7903
7904     case IF_THEN_ELSE:
7905       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7906                                          XEXP (x, 2), cost, speed);
7907
7908     case EQ:
7909     case NE:
7910     case GT:
7911     case GTU:
7912     case LT:
7913     case LTU:
7914     case GE:
7915     case GEU:
7916     case LE:
7917     case LEU:
7918
7919       return false; /* All arguments must be in registers.  */
7920
7921     case FMA:
7922       op0 = XEXP (x, 0);
7923       op1 = XEXP (x, 1);
7924       op2 = XEXP (x, 2);
7925
7926       if (speed)
7927         {
7928           if (VECTOR_MODE_P (mode))
7929             *cost += extra_cost->vect.alu;
7930           else
7931             *cost += extra_cost->fp[mode == DFmode].fma;
7932         }
7933
7934       /* FMSUB, FNMADD, and FNMSUB are free.  */
7935       if (GET_CODE (op0) == NEG)
7936         op0 = XEXP (op0, 0);
7937
7938       if (GET_CODE (op2) == NEG)
7939         op2 = XEXP (op2, 0);
7940
7941       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7942          and the by-element operand as operand 0.  */
7943       if (GET_CODE (op1) == NEG)
7944         op1 = XEXP (op1, 0);
7945
7946       /* Catch vector-by-element operations.  The by-element operand can
7947          either be (vec_duplicate (vec_select (x))) or just
7948          (vec_select (x)), depending on whether we are multiplying by
7949          a vector or a scalar.
7950
7951          Canonicalization is not very good in these cases, FMA4 will put the
7952          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7953       if (GET_CODE (op0) == VEC_DUPLICATE)
7954         op0 = XEXP (op0, 0);
7955       else if (GET_CODE (op1) == VEC_DUPLICATE)
7956         op1 = XEXP (op1, 0);
7957
7958       if (GET_CODE (op0) == VEC_SELECT)
7959         op0 = XEXP (op0, 0);
7960       else if (GET_CODE (op1) == VEC_SELECT)
7961         op1 = XEXP (op1, 0);
7962
7963       /* If the remaining parameters are not registers,
7964          get the cost to put them into registers.  */
7965       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7966       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7967       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7968       return true;
7969
7970     case FLOAT:
7971     case UNSIGNED_FLOAT:
7972       if (speed)
7973         *cost += extra_cost->fp[mode == DFmode].fromint;
7974       return false;
7975
7976     case FLOAT_EXTEND:
7977       if (speed)
7978         {
7979           if (VECTOR_MODE_P (mode))
7980             {
7981               /*Vector truncate.  */
7982               *cost += extra_cost->vect.alu;
7983             }
7984           else
7985             *cost += extra_cost->fp[mode == DFmode].widen;
7986         }
7987       return false;
7988
7989     case FLOAT_TRUNCATE:
7990       if (speed)
7991         {
7992           if (VECTOR_MODE_P (mode))
7993             {
7994               /*Vector conversion.  */
7995               *cost += extra_cost->vect.alu;
7996             }
7997           else
7998             *cost += extra_cost->fp[mode == DFmode].narrow;
7999         }
8000       return false;
8001
8002     case FIX:
8003     case UNSIGNED_FIX:
8004       x = XEXP (x, 0);
8005       /* Strip the rounding part.  They will all be implemented
8006          by the fcvt* family of instructions anyway.  */
8007       if (GET_CODE (x) == UNSPEC)
8008         {
8009           unsigned int uns_code = XINT (x, 1);
8010
8011           if (uns_code == UNSPEC_FRINTA
8012               || uns_code == UNSPEC_FRINTM
8013               || uns_code == UNSPEC_FRINTN
8014               || uns_code == UNSPEC_FRINTP
8015               || uns_code == UNSPEC_FRINTZ)
8016             x = XVECEXP (x, 0, 0);
8017         }
8018
8019       if (speed)
8020         {
8021           if (VECTOR_MODE_P (mode))
8022             *cost += extra_cost->vect.alu;
8023           else
8024             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
8025         }
8026
8027       /* We can combine fmul by a power of 2 followed by a fcvt into a single
8028          fixed-point fcvt.  */
8029       if (GET_CODE (x) == MULT
8030           && ((VECTOR_MODE_P (mode)
8031                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
8032               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
8033         {
8034           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
8035                              0, speed);
8036           return true;
8037         }
8038
8039       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
8040       return true;
8041
8042     case ABS:
8043       if (VECTOR_MODE_P (mode))
8044         {
8045           /* ABS (vector).  */
8046           if (speed)
8047             *cost += extra_cost->vect.alu;
8048         }
8049       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8050         {
8051           op0 = XEXP (x, 0);
8052
8053           /* FABD, which is analogous to FADD.  */
8054           if (GET_CODE (op0) == MINUS)
8055             {
8056               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8057               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8058               if (speed)
8059                 *cost += extra_cost->fp[mode == DFmode].addsub;
8060
8061               return true;
8062             }
8063           /* Simple FABS is analogous to FNEG.  */
8064           if (speed)
8065             *cost += extra_cost->fp[mode == DFmode].neg;
8066         }
8067       else
8068         {
8069           /* Integer ABS will either be split to
8070              two arithmetic instructions, or will be an ABS
8071              (scalar), which we don't model.  */
8072           *cost = COSTS_N_INSNS (2);
8073           if (speed)
8074             *cost += 2 * extra_cost->alu.arith;
8075         }
8076       return false;
8077
8078     case SMAX:
8079     case SMIN:
8080       if (speed)
8081         {
8082           if (VECTOR_MODE_P (mode))
8083             *cost += extra_cost->vect.alu;
8084           else
8085             {
8086               /* FMAXNM/FMINNM/FMAX/FMIN.
8087                  TODO: This may not be accurate for all implementations, but
8088                  we do not model this in the cost tables.  */
8089               *cost += extra_cost->fp[mode == DFmode].addsub;
8090             }
8091         }
8092       return false;
8093
8094     case UNSPEC:
8095       /* The floating point round to integer frint* instructions.  */
8096       if (aarch64_frint_unspec_p (XINT (x, 1)))
8097         {
8098           if (speed)
8099             *cost += extra_cost->fp[mode == DFmode].roundint;
8100
8101           return false;
8102         }
8103
8104       if (XINT (x, 1) == UNSPEC_RBIT)
8105         {
8106           if (speed)
8107             *cost += extra_cost->alu.rev;
8108
8109           return false;
8110         }
8111       break;
8112
8113     case TRUNCATE:
8114
8115       /* Decompose <su>muldi3_highpart.  */
8116       if (/* (truncate:DI  */
8117           mode == DImode
8118           /*   (lshiftrt:TI  */
8119           && GET_MODE (XEXP (x, 0)) == TImode
8120           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8121           /*      (mult:TI  */
8122           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8123           /*        (ANY_EXTEND:TI (reg:DI))
8124                     (ANY_EXTEND:TI (reg:DI)))  */
8125           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8126                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8127               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8128                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8129           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8130           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8131           /*     (const_int 64)  */
8132           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8133           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8134         {
8135           /* UMULH/SMULH.  */
8136           if (speed)
8137             *cost += extra_cost->mult[mode == DImode].extend;
8138           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8139                              mode, MULT, 0, speed);
8140           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8141                              mode, MULT, 1, speed);
8142           return true;
8143         }
8144
8145       /* Fall through.  */
8146     default:
8147       break;
8148     }
8149
8150   if (dump_file
8151       && flag_aarch64_verbose_cost)
8152     fprintf (dump_file,
8153       "\nFailed to cost RTX.  Assuming default cost.\n");
8154
8155   return true;
8156 }
8157
8158 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8159    calculated for X.  This cost is stored in *COST.  Returns true
8160    if the total cost of X was calculated.  */
8161 static bool
8162 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8163                    int param, int *cost, bool speed)
8164 {
8165   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8166
8167   if (dump_file
8168       && flag_aarch64_verbose_cost)
8169     {
8170       print_rtl_single (dump_file, x);
8171       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8172                speed ? "Hot" : "Cold",
8173                *cost, result ? "final" : "partial");
8174     }
8175
8176   return result;
8177 }
8178
8179 static int
8180 aarch64_register_move_cost (machine_mode mode,
8181                             reg_class_t from_i, reg_class_t to_i)
8182 {
8183   enum reg_class from = (enum reg_class) from_i;
8184   enum reg_class to = (enum reg_class) to_i;
8185   const struct cpu_regmove_cost *regmove_cost
8186     = aarch64_tune_params.regmove_cost;
8187
8188   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8189   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8190     to = GENERAL_REGS;
8191
8192   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8193     from = GENERAL_REGS;
8194
8195   /* Moving between GPR and stack cost is the same as GP2GP.  */
8196   if ((from == GENERAL_REGS && to == STACK_REG)
8197       || (to == GENERAL_REGS && from == STACK_REG))
8198     return regmove_cost->GP2GP;
8199
8200   /* To/From the stack register, we move via the gprs.  */
8201   if (to == STACK_REG || from == STACK_REG)
8202     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8203             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8204
8205   if (GET_MODE_SIZE (mode) == 16)
8206     {
8207       /* 128-bit operations on general registers require 2 instructions.  */
8208       if (from == GENERAL_REGS && to == GENERAL_REGS)
8209         return regmove_cost->GP2GP * 2;
8210       else if (from == GENERAL_REGS)
8211         return regmove_cost->GP2FP * 2;
8212       else if (to == GENERAL_REGS)
8213         return regmove_cost->FP2GP * 2;
8214
8215       /* When AdvSIMD instructions are disabled it is not possible to move
8216          a 128-bit value directly between Q registers.  This is handled in
8217          secondary reload.  A general register is used as a scratch to move
8218          the upper DI value and the lower DI value is moved directly,
8219          hence the cost is the sum of three moves. */
8220       if (! TARGET_SIMD)
8221         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8222
8223       return regmove_cost->FP2FP;
8224     }
8225
8226   if (from == GENERAL_REGS && to == GENERAL_REGS)
8227     return regmove_cost->GP2GP;
8228   else if (from == GENERAL_REGS)
8229     return regmove_cost->GP2FP;
8230   else if (to == GENERAL_REGS)
8231     return regmove_cost->FP2GP;
8232
8233   return regmove_cost->FP2FP;
8234 }
8235
8236 static int
8237 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8238                           reg_class_t rclass ATTRIBUTE_UNUSED,
8239                           bool in ATTRIBUTE_UNUSED)
8240 {
8241   return aarch64_tune_params.memmov_cost;
8242 }
8243
8244 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8245    to optimize 1.0/sqrt.  */
8246
8247 static bool
8248 use_rsqrt_p (machine_mode mode)
8249 {
8250   return (!flag_trapping_math
8251           && flag_unsafe_math_optimizations
8252           && ((aarch64_tune_params.approx_modes->recip_sqrt
8253                & AARCH64_APPROX_MODE (mode))
8254               || flag_mrecip_low_precision_sqrt));
8255 }
8256
8257 /* Function to decide when to use the approximate reciprocal square root
8258    builtin.  */
8259
8260 static tree
8261 aarch64_builtin_reciprocal (tree fndecl)
8262 {
8263   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8264
8265   if (!use_rsqrt_p (mode))
8266     return NULL_TREE;
8267   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8268 }
8269
8270 typedef rtx (*rsqrte_type) (rtx, rtx);
8271
8272 /* Select reciprocal square root initial estimate insn depending on machine
8273    mode.  */
8274
8275 static rsqrte_type
8276 get_rsqrte_type (machine_mode mode)
8277 {
8278   switch (mode)
8279   {
8280     case E_DFmode:   return gen_aarch64_rsqrtedf;
8281     case E_SFmode:   return gen_aarch64_rsqrtesf;
8282     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8283     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8284     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8285     default: gcc_unreachable ();
8286   }
8287 }
8288
8289 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8290
8291 /* Select reciprocal square root series step insn depending on machine mode.  */
8292
8293 static rsqrts_type
8294 get_rsqrts_type (machine_mode mode)
8295 {
8296   switch (mode)
8297   {
8298     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8299     case E_SFmode:   return gen_aarch64_rsqrtssf;
8300     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8301     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8302     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8303     default: gcc_unreachable ();
8304   }
8305 }
8306
8307 /* Emit instruction sequence to compute either the approximate square root
8308    or its approximate reciprocal, depending on the flag RECP, and return
8309    whether the sequence was emitted or not.  */
8310
8311 bool
8312 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8313 {
8314   machine_mode mode = GET_MODE (dst);
8315
8316   if (GET_MODE_INNER (mode) == HFmode)
8317     {
8318       gcc_assert (!recp);
8319       return false;
8320     }
8321
8322   if (!recp)
8323     {
8324       if (!(flag_mlow_precision_sqrt
8325             || (aarch64_tune_params.approx_modes->sqrt
8326                 & AARCH64_APPROX_MODE (mode))))
8327         return false;
8328
8329       if (flag_finite_math_only
8330           || flag_trapping_math
8331           || !flag_unsafe_math_optimizations
8332           || optimize_function_for_size_p (cfun))
8333         return false;
8334     }
8335   else
8336     /* Caller assumes we cannot fail.  */
8337     gcc_assert (use_rsqrt_p (mode));
8338
8339   machine_mode mmsk = mode_for_int_vector (mode).require ();
8340   rtx xmsk = gen_reg_rtx (mmsk);
8341   if (!recp)
8342     /* When calculating the approximate square root, compare the
8343        argument with 0.0 and create a mask.  */
8344     emit_insn (gen_rtx_SET (xmsk,
8345                             gen_rtx_NEG (mmsk,
8346                                          gen_rtx_EQ (mmsk, src,
8347                                                      CONST0_RTX (mode)))));
8348
8349   /* Estimate the approximate reciprocal square root.  */
8350   rtx xdst = gen_reg_rtx (mode);
8351   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8352
8353   /* Iterate over the series twice for SF and thrice for DF.  */
8354   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8355
8356   /* Optionally iterate over the series once less for faster performance
8357      while sacrificing the accuracy.  */
8358   if ((recp && flag_mrecip_low_precision_sqrt)
8359       || (!recp && flag_mlow_precision_sqrt))
8360     iterations--;
8361
8362   /* Iterate over the series to calculate the approximate reciprocal square
8363      root.  */
8364   rtx x1 = gen_reg_rtx (mode);
8365   while (iterations--)
8366     {
8367       rtx x2 = gen_reg_rtx (mode);
8368       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8369
8370       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8371
8372       if (iterations > 0)
8373         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8374     }
8375
8376   if (!recp)
8377     {
8378       /* Qualify the approximate reciprocal square root when the argument is
8379          0.0 by squashing the intermediary result to 0.0.  */
8380       rtx xtmp = gen_reg_rtx (mmsk);
8381       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8382                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8383       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8384
8385       /* Calculate the approximate square root.  */
8386       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8387     }
8388
8389   /* Finalize the approximation.  */
8390   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8391
8392   return true;
8393 }
8394
8395 typedef rtx (*recpe_type) (rtx, rtx);
8396
8397 /* Select reciprocal initial estimate insn depending on machine mode.  */
8398
8399 static recpe_type
8400 get_recpe_type (machine_mode mode)
8401 {
8402   switch (mode)
8403   {
8404     case E_SFmode:   return (gen_aarch64_frecpesf);
8405     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8406     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8407     case E_DFmode:   return (gen_aarch64_frecpedf);
8408     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8409     default:         gcc_unreachable ();
8410   }
8411 }
8412
8413 typedef rtx (*recps_type) (rtx, rtx, rtx);
8414
8415 /* Select reciprocal series step insn depending on machine mode.  */
8416
8417 static recps_type
8418 get_recps_type (machine_mode mode)
8419 {
8420   switch (mode)
8421   {
8422     case E_SFmode:   return (gen_aarch64_frecpssf);
8423     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8424     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8425     case E_DFmode:   return (gen_aarch64_frecpsdf);
8426     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8427     default:         gcc_unreachable ();
8428   }
8429 }
8430
8431 /* Emit the instruction sequence to compute the approximation for the division
8432    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8433
8434 bool
8435 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8436 {
8437   machine_mode mode = GET_MODE (quo);
8438
8439   if (GET_MODE_INNER (mode) == HFmode)
8440     return false;
8441
8442   bool use_approx_division_p = (flag_mlow_precision_div
8443                                 || (aarch64_tune_params.approx_modes->division
8444                                     & AARCH64_APPROX_MODE (mode)));
8445
8446   if (!flag_finite_math_only
8447       || flag_trapping_math
8448       || !flag_unsafe_math_optimizations
8449       || optimize_function_for_size_p (cfun)
8450       || !use_approx_division_p)
8451     return false;
8452
8453   /* Estimate the approximate reciprocal.  */
8454   rtx xrcp = gen_reg_rtx (mode);
8455   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8456
8457   /* Iterate over the series twice for SF and thrice for DF.  */
8458   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8459
8460   /* Optionally iterate over the series once less for faster performance,
8461      while sacrificing the accuracy.  */
8462   if (flag_mlow_precision_div)
8463     iterations--;
8464
8465   /* Iterate over the series to calculate the approximate reciprocal.  */
8466   rtx xtmp = gen_reg_rtx (mode);
8467   while (iterations--)
8468     {
8469       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8470
8471       if (iterations > 0)
8472         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8473     }
8474
8475   if (num != CONST1_RTX (mode))
8476     {
8477       /* As the approximate reciprocal of DEN is already calculated, only
8478          calculate the approximate division when NUM is not 1.0.  */
8479       rtx xnum = force_reg (mode, num);
8480       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8481     }
8482
8483   /* Finalize the approximation.  */
8484   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8485   return true;
8486 }
8487
8488 /* Return the number of instructions that can be issued per cycle.  */
8489 static int
8490 aarch64_sched_issue_rate (void)
8491 {
8492   return aarch64_tune_params.issue_rate;
8493 }
8494
8495 static int
8496 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8497 {
8498   int issue_rate = aarch64_sched_issue_rate ();
8499
8500   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8501 }
8502
8503
8504 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8505    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8506    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8507
8508 static int
8509 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8510                                                     int ready_index)
8511 {
8512   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8513 }
8514
8515
8516 /* Vectorizer cost model target hooks.  */
8517
8518 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8519 static int
8520 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8521                                     tree vectype,
8522                                     int misalign ATTRIBUTE_UNUSED)
8523 {
8524   unsigned elements;
8525   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8526   bool fp = false;
8527
8528   if (vectype != NULL)
8529     fp = FLOAT_TYPE_P (vectype);
8530
8531   switch (type_of_cost)
8532     {
8533       case scalar_stmt:
8534         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8535
8536       case scalar_load:
8537         return costs->scalar_load_cost;
8538
8539       case scalar_store:
8540         return costs->scalar_store_cost;
8541
8542       case vector_stmt:
8543         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8544
8545       case vector_load:
8546         return costs->vec_align_load_cost;
8547
8548       case vector_store:
8549         return costs->vec_store_cost;
8550
8551       case vec_to_scalar:
8552         return costs->vec_to_scalar_cost;
8553
8554       case scalar_to_vec:
8555         return costs->scalar_to_vec_cost;
8556
8557       case unaligned_load:
8558         return costs->vec_unalign_load_cost;
8559
8560       case unaligned_store:
8561         return costs->vec_unalign_store_cost;
8562
8563       case cond_branch_taken:
8564         return costs->cond_taken_branch_cost;
8565
8566       case cond_branch_not_taken:
8567         return costs->cond_not_taken_branch_cost;
8568
8569       case vec_perm:
8570         return costs->vec_permute_cost;
8571
8572       case vec_promote_demote:
8573         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8574
8575       case vec_construct:
8576         elements = TYPE_VECTOR_SUBPARTS (vectype);
8577         return elements / 2 + 1;
8578
8579       default:
8580         gcc_unreachable ();
8581     }
8582 }
8583
8584 /* Implement targetm.vectorize.add_stmt_cost.  */
8585 static unsigned
8586 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8587                        struct _stmt_vec_info *stmt_info, int misalign,
8588                        enum vect_cost_model_location where)
8589 {
8590   unsigned *cost = (unsigned *) data;
8591   unsigned retval = 0;
8592
8593   if (flag_vect_cost_model)
8594     {
8595       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8596       int stmt_cost =
8597             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8598
8599       /* Statements in an inner loop relative to the loop being
8600          vectorized are weighted more heavily.  The value here is
8601          arbitrary and could potentially be improved with analysis.  */
8602       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8603         count *= 50; /*  FIXME  */
8604
8605       retval = (unsigned) (count * stmt_cost);
8606       cost[where] += retval;
8607     }
8608
8609   return retval;
8610 }
8611
8612 static void initialize_aarch64_code_model (struct gcc_options *);
8613
8614 /* Parse the TO_PARSE string and put the architecture struct that it
8615    selects into RES and the architectural features into ISA_FLAGS.
8616    Return an aarch64_parse_opt_result describing the parse result.
8617    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8618
8619 static enum aarch64_parse_opt_result
8620 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8621                     unsigned long *isa_flags)
8622 {
8623   char *ext;
8624   const struct processor *arch;
8625   char *str = (char *) alloca (strlen (to_parse) + 1);
8626   size_t len;
8627
8628   strcpy (str, to_parse);
8629
8630   ext = strchr (str, '+');
8631
8632   if (ext != NULL)
8633     len = ext - str;
8634   else
8635     len = strlen (str);
8636
8637   if (len == 0)
8638     return AARCH64_PARSE_MISSING_ARG;
8639
8640
8641   /* Loop through the list of supported ARCHes to find a match.  */
8642   for (arch = all_architectures; arch->name != NULL; arch++)
8643     {
8644       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8645         {
8646           unsigned long isa_temp = arch->flags;
8647
8648           if (ext != NULL)
8649             {
8650               /* TO_PARSE string contains at least one extension.  */
8651               enum aarch64_parse_opt_result ext_res
8652                 = aarch64_parse_extension (ext, &isa_temp);
8653
8654               if (ext_res != AARCH64_PARSE_OK)
8655                 return ext_res;
8656             }
8657           /* Extension parsing was successful.  Confirm the result
8658              arch and ISA flags.  */
8659           *res = arch;
8660           *isa_flags = isa_temp;
8661           return AARCH64_PARSE_OK;
8662         }
8663     }
8664
8665   /* ARCH name not found in list.  */
8666   return AARCH64_PARSE_INVALID_ARG;
8667 }
8668
8669 /* Parse the TO_PARSE string and put the result tuning in RES and the
8670    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8671    describing the parse result.  If there is an error parsing, RES and
8672    ISA_FLAGS are left unchanged.  */
8673
8674 static enum aarch64_parse_opt_result
8675 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8676                    unsigned long *isa_flags)
8677 {
8678   char *ext;
8679   const struct processor *cpu;
8680   char *str = (char *) alloca (strlen (to_parse) + 1);
8681   size_t len;
8682
8683   strcpy (str, to_parse);
8684
8685   ext = strchr (str, '+');
8686
8687   if (ext != NULL)
8688     len = ext - str;
8689   else
8690     len = strlen (str);
8691
8692   if (len == 0)
8693     return AARCH64_PARSE_MISSING_ARG;
8694
8695
8696   /* Loop through the list of supported CPUs to find a match.  */
8697   for (cpu = all_cores; cpu->name != NULL; cpu++)
8698     {
8699       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8700         {
8701           unsigned long isa_temp = cpu->flags;
8702
8703
8704           if (ext != NULL)
8705             {
8706               /* TO_PARSE string contains at least one extension.  */
8707               enum aarch64_parse_opt_result ext_res
8708                 = aarch64_parse_extension (ext, &isa_temp);
8709
8710               if (ext_res != AARCH64_PARSE_OK)
8711                 return ext_res;
8712             }
8713           /* Extension parsing was successfull.  Confirm the result
8714              cpu and ISA flags.  */
8715           *res = cpu;
8716           *isa_flags = isa_temp;
8717           return AARCH64_PARSE_OK;
8718         }
8719     }
8720
8721   /* CPU name not found in list.  */
8722   return AARCH64_PARSE_INVALID_ARG;
8723 }
8724
8725 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8726    Return an aarch64_parse_opt_result describing the parse result.
8727    If the parsing fails the RES does not change.  */
8728
8729 static enum aarch64_parse_opt_result
8730 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8731 {
8732   const struct processor *cpu;
8733   char *str = (char *) alloca (strlen (to_parse) + 1);
8734
8735   strcpy (str, to_parse);
8736
8737   /* Loop through the list of supported CPUs to find a match.  */
8738   for (cpu = all_cores; cpu->name != NULL; cpu++)
8739     {
8740       if (strcmp (cpu->name, str) == 0)
8741         {
8742           *res = cpu;
8743           return AARCH64_PARSE_OK;
8744         }
8745     }
8746
8747   /* CPU name not found in list.  */
8748   return AARCH64_PARSE_INVALID_ARG;
8749 }
8750
8751 /* Parse TOKEN, which has length LENGTH to see if it is an option
8752    described in FLAG.  If it is, return the index bit for that fusion type.
8753    If not, error (printing OPTION_NAME) and return zero.  */
8754
8755 static unsigned int
8756 aarch64_parse_one_option_token (const char *token,
8757                                 size_t length,
8758                                 const struct aarch64_flag_desc *flag,
8759                                 const char *option_name)
8760 {
8761   for (; flag->name != NULL; flag++)
8762     {
8763       if (length == strlen (flag->name)
8764           && !strncmp (flag->name, token, length))
8765         return flag->flag;
8766     }
8767
8768   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8769   return 0;
8770 }
8771
8772 /* Parse OPTION which is a comma-separated list of flags to enable.
8773    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8774    default state we inherit from the CPU tuning structures.  OPTION_NAME
8775    gives the top-level option we are parsing in the -moverride string,
8776    for use in error messages.  */
8777
8778 static unsigned int
8779 aarch64_parse_boolean_options (const char *option,
8780                                const struct aarch64_flag_desc *flags,
8781                                unsigned int initial_state,
8782                                const char *option_name)
8783 {
8784   const char separator = '.';
8785   const char* specs = option;
8786   const char* ntoken = option;
8787   unsigned int found_flags = initial_state;
8788
8789   while ((ntoken = strchr (specs, separator)))
8790     {
8791       size_t token_length = ntoken - specs;
8792       unsigned token_ops = aarch64_parse_one_option_token (specs,
8793                                                            token_length,
8794                                                            flags,
8795                                                            option_name);
8796       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8797          in the token stream, reset the supported operations.  So:
8798
8799            adrp+add.cmp+branch.none.adrp+add
8800
8801            would have the result of turning on only adrp+add fusion.  */
8802       if (!token_ops)
8803         found_flags = 0;
8804
8805       found_flags |= token_ops;
8806       specs = ++ntoken;
8807     }
8808
8809   /* We ended with a comma, print something.  */
8810   if (!(*specs))
8811     {
8812       error ("%s string ill-formed\n", option_name);
8813       return 0;
8814     }
8815
8816   /* We still have one more token to parse.  */
8817   size_t token_length = strlen (specs);
8818   unsigned token_ops = aarch64_parse_one_option_token (specs,
8819                                                        token_length,
8820                                                        flags,
8821                                                        option_name);
8822    if (!token_ops)
8823      found_flags = 0;
8824
8825   found_flags |= token_ops;
8826   return found_flags;
8827 }
8828
8829 /* Support for overriding instruction fusion.  */
8830
8831 static void
8832 aarch64_parse_fuse_string (const char *fuse_string,
8833                             struct tune_params *tune)
8834 {
8835   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8836                                                      aarch64_fusible_pairs,
8837                                                      tune->fusible_ops,
8838                                                      "fuse=");
8839 }
8840
8841 /* Support for overriding other tuning flags.  */
8842
8843 static void
8844 aarch64_parse_tune_string (const char *tune_string,
8845                             struct tune_params *tune)
8846 {
8847   tune->extra_tuning_flags
8848     = aarch64_parse_boolean_options (tune_string,
8849                                      aarch64_tuning_flags,
8850                                      tune->extra_tuning_flags,
8851                                      "tune=");
8852 }
8853
8854 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8855    we understand.  If it is, extract the option string and handoff to
8856    the appropriate function.  */
8857
8858 void
8859 aarch64_parse_one_override_token (const char* token,
8860                                   size_t length,
8861                                   struct tune_params *tune)
8862 {
8863   const struct aarch64_tuning_override_function *fn
8864     = aarch64_tuning_override_functions;
8865
8866   const char *option_part = strchr (token, '=');
8867   if (!option_part)
8868     {
8869       error ("tuning string missing in option (%s)", token);
8870       return;
8871     }
8872
8873   /* Get the length of the option name.  */
8874   length = option_part - token;
8875   /* Skip the '=' to get to the option string.  */
8876   option_part++;
8877
8878   for (; fn->name != NULL; fn++)
8879     {
8880       if (!strncmp (fn->name, token, length))
8881         {
8882           fn->parse_override (option_part, tune);
8883           return;
8884         }
8885     }
8886
8887   error ("unknown tuning option (%s)",token);
8888   return;
8889 }
8890
8891 /* A checking mechanism for the implementation of the tls size.  */
8892
8893 static void
8894 initialize_aarch64_tls_size (struct gcc_options *opts)
8895 {
8896   if (aarch64_tls_size == 0)
8897     aarch64_tls_size = 24;
8898
8899   switch (opts->x_aarch64_cmodel_var)
8900     {
8901     case AARCH64_CMODEL_TINY:
8902       /* Both the default and maximum TLS size allowed under tiny is 1M which
8903          needs two instructions to address, so we clamp the size to 24.  */
8904       if (aarch64_tls_size > 24)
8905         aarch64_tls_size = 24;
8906       break;
8907     case AARCH64_CMODEL_SMALL:
8908       /* The maximum TLS size allowed under small is 4G.  */
8909       if (aarch64_tls_size > 32)
8910         aarch64_tls_size = 32;
8911       break;
8912     case AARCH64_CMODEL_LARGE:
8913       /* The maximum TLS size allowed under large is 16E.
8914          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8915       if (aarch64_tls_size > 48)
8916         aarch64_tls_size = 48;
8917       break;
8918     default:
8919       gcc_unreachable ();
8920     }
8921
8922   return;
8923 }
8924
8925 /* Parse STRING looking for options in the format:
8926      string     :: option:string
8927      option     :: name=substring
8928      name       :: {a-z}
8929      substring  :: defined by option.  */
8930
8931 static void
8932 aarch64_parse_override_string (const char* input_string,
8933                                struct tune_params* tune)
8934 {
8935   const char separator = ':';
8936   size_t string_length = strlen (input_string) + 1;
8937   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8938   char *string = string_root;
8939   strncpy (string, input_string, string_length);
8940   string[string_length - 1] = '\0';
8941
8942   char* ntoken = string;
8943
8944   while ((ntoken = strchr (string, separator)))
8945     {
8946       size_t token_length = ntoken - string;
8947       /* Make this substring look like a string.  */
8948       *ntoken = '\0';
8949       aarch64_parse_one_override_token (string, token_length, tune);
8950       string = ++ntoken;
8951     }
8952
8953   /* One last option to parse.  */
8954   aarch64_parse_one_override_token (string, strlen (string), tune);
8955   free (string_root);
8956 }
8957
8958
8959 static void
8960 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8961 {
8962   /* The logic here is that if we are disabling all frame pointer generation
8963      then we do not need to disable leaf frame pointer generation as a
8964      separate operation.  But if we are *only* disabling leaf frame pointer
8965      generation then we set flag_omit_frame_pointer to true, but in
8966      aarch64_frame_pointer_required we return false only for leaf functions.
8967
8968      PR 70044: We have to be careful about being called multiple times for the
8969      same function.  Once we have decided to set flag_omit_frame_pointer just
8970      so that we can omit leaf frame pointers, we must then not interpret a
8971      second call as meaning that all frame pointer generation should be
8972      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8973      non-zero value.  */
8974   if (opts->x_flag_omit_frame_pointer == 2)
8975     opts->x_flag_omit_frame_pointer = 0;
8976
8977   if (opts->x_flag_omit_frame_pointer)
8978     opts->x_flag_omit_leaf_frame_pointer = false;
8979   else if (opts->x_flag_omit_leaf_frame_pointer)
8980     opts->x_flag_omit_frame_pointer = 2;
8981
8982   /* If not optimizing for size, set the default
8983      alignment to what the target wants.  */
8984   if (!opts->x_optimize_size)
8985     {
8986       if (opts->x_align_loops <= 0)
8987         opts->x_align_loops = aarch64_tune_params.loop_align;
8988       if (opts->x_align_jumps <= 0)
8989         opts->x_align_jumps = aarch64_tune_params.jump_align;
8990       if (opts->x_align_functions <= 0)
8991         opts->x_align_functions = aarch64_tune_params.function_align;
8992     }
8993
8994   /* We default to no pc-relative literal loads.  */
8995
8996   aarch64_pcrelative_literal_loads = false;
8997
8998   /* If -mpc-relative-literal-loads is set on the command line, this
8999      implies that the user asked for PC relative literal loads.  */
9000   if (opts->x_pcrelative_literal_loads == 1)
9001     aarch64_pcrelative_literal_loads = true;
9002
9003   /* In the tiny memory model it makes no sense to disallow PC relative
9004      literal pool loads.  */
9005   if (aarch64_cmodel == AARCH64_CMODEL_TINY
9006       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9007     aarch64_pcrelative_literal_loads = true;
9008
9009   /* When enabling the lower precision Newton series for the square root, also
9010      enable it for the reciprocal square root, since the latter is an
9011      intermediary step for the former.  */
9012   if (flag_mlow_precision_sqrt)
9013     flag_mrecip_low_precision_sqrt = true;
9014 }
9015
9016 /* 'Unpack' up the internal tuning structs and update the options
9017     in OPTS.  The caller must have set up selected_tune and selected_arch
9018     as all the other target-specific codegen decisions are
9019     derived from them.  */
9020
9021 void
9022 aarch64_override_options_internal (struct gcc_options *opts)
9023 {
9024   aarch64_tune_flags = selected_tune->flags;
9025   aarch64_tune = selected_tune->sched_core;
9026   /* Make a copy of the tuning parameters attached to the core, which
9027      we may later overwrite.  */
9028   aarch64_tune_params = *(selected_tune->tune);
9029   aarch64_architecture_version = selected_arch->architecture_version;
9030
9031   if (opts->x_aarch64_override_tune_string)
9032     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
9033                                   &aarch64_tune_params);
9034
9035   /* This target defaults to strict volatile bitfields.  */
9036   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
9037     opts->x_flag_strict_volatile_bitfields = 1;
9038
9039   initialize_aarch64_code_model (opts);
9040   initialize_aarch64_tls_size (opts);
9041
9042   int queue_depth = 0;
9043   switch (aarch64_tune_params.autoprefetcher_model)
9044     {
9045       case tune_params::AUTOPREFETCHER_OFF:
9046         queue_depth = -1;
9047         break;
9048       case tune_params::AUTOPREFETCHER_WEAK:
9049         queue_depth = 0;
9050         break;
9051       case tune_params::AUTOPREFETCHER_STRONG:
9052         queue_depth = max_insn_queue_index + 1;
9053         break;
9054       default:
9055         gcc_unreachable ();
9056     }
9057
9058   /* We don't mind passing in global_options_set here as we don't use
9059      the *options_set structs anyway.  */
9060   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9061                          queue_depth,
9062                          opts->x_param_values,
9063                          global_options_set.x_param_values);
9064
9065   /* Set up parameters to be used in prefetching algorithm.  Do not
9066      override the defaults unless we are tuning for a core we have
9067      researched values for.  */
9068   if (aarch64_tune_params.prefetch->num_slots > 0)
9069     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9070                            aarch64_tune_params.prefetch->num_slots,
9071                            opts->x_param_values,
9072                            global_options_set.x_param_values);
9073   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9074     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9075                            aarch64_tune_params.prefetch->l1_cache_size,
9076                            opts->x_param_values,
9077                            global_options_set.x_param_values);
9078   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9079     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9080                            aarch64_tune_params.prefetch->l1_cache_line_size,
9081                            opts->x_param_values,
9082                            global_options_set.x_param_values);
9083   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9084     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9085                            aarch64_tune_params.prefetch->l2_cache_size,
9086                            opts->x_param_values,
9087                            global_options_set.x_param_values);
9088
9089   /* Enable sw prefetching at specified optimization level for
9090      CPUS that have prefetch.  Lower optimization level threshold by 1
9091      when profiling is enabled.  */
9092   if (opts->x_flag_prefetch_loop_arrays < 0
9093       && !opts->x_optimize_size
9094       && aarch64_tune_params.prefetch->default_opt_level >= 0
9095       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9096     opts->x_flag_prefetch_loop_arrays = 1;
9097
9098   aarch64_override_options_after_change_1 (opts);
9099 }
9100
9101 /* Print a hint with a suggestion for a core or architecture name that
9102    most closely resembles what the user passed in STR.  ARCH is true if
9103    the user is asking for an architecture name.  ARCH is false if the user
9104    is asking for a core name.  */
9105
9106 static void
9107 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9108 {
9109   auto_vec<const char *> candidates;
9110   const struct processor *entry = arch ? all_architectures : all_cores;
9111   for (; entry->name != NULL; entry++)
9112     candidates.safe_push (entry->name);
9113   char *s;
9114   const char *hint = candidates_list_and_hint (str, s, candidates);
9115   if (hint)
9116     inform (input_location, "valid arguments are: %s;"
9117                              " did you mean %qs?", s, hint);
9118   XDELETEVEC (s);
9119 }
9120
9121 /* Print a hint with a suggestion for a core name that most closely resembles
9122    what the user passed in STR.  */
9123
9124 inline static void
9125 aarch64_print_hint_for_core (const char *str)
9126 {
9127   aarch64_print_hint_for_core_or_arch (str, false);
9128 }
9129
9130 /* Print a hint with a suggestion for an architecture name that most closely
9131    resembles what the user passed in STR.  */
9132
9133 inline static void
9134 aarch64_print_hint_for_arch (const char *str)
9135 {
9136   aarch64_print_hint_for_core_or_arch (str, true);
9137 }
9138
9139 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9140    specified in STR and throw errors if appropriate.  Put the results if
9141    they are valid in RES and ISA_FLAGS.  Return whether the option is
9142    valid.  */
9143
9144 static bool
9145 aarch64_validate_mcpu (const char *str, const struct processor **res,
9146                        unsigned long *isa_flags)
9147 {
9148   enum aarch64_parse_opt_result parse_res
9149     = aarch64_parse_cpu (str, res, isa_flags);
9150
9151   if (parse_res == AARCH64_PARSE_OK)
9152     return true;
9153
9154   switch (parse_res)
9155     {
9156       case AARCH64_PARSE_MISSING_ARG:
9157         error ("missing cpu name in %<-mcpu=%s%>", str);
9158         break;
9159       case AARCH64_PARSE_INVALID_ARG:
9160         error ("unknown value %qs for -mcpu", str);
9161         aarch64_print_hint_for_core (str);
9162         break;
9163       case AARCH64_PARSE_INVALID_FEATURE:
9164         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9165         break;
9166       default:
9167         gcc_unreachable ();
9168     }
9169
9170   return false;
9171 }
9172
9173 /* Validate a command-line -march option.  Parse the arch and extensions
9174    (if any) specified in STR and throw errors if appropriate.  Put the
9175    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9176    option is valid.  */
9177
9178 static bool
9179 aarch64_validate_march (const char *str, const struct processor **res,
9180                          unsigned long *isa_flags)
9181 {
9182   enum aarch64_parse_opt_result parse_res
9183     = aarch64_parse_arch (str, res, isa_flags);
9184
9185   if (parse_res == AARCH64_PARSE_OK)
9186     return true;
9187
9188   switch (parse_res)
9189     {
9190       case AARCH64_PARSE_MISSING_ARG:
9191         error ("missing arch name in %<-march=%s%>", str);
9192         break;
9193       case AARCH64_PARSE_INVALID_ARG:
9194         error ("unknown value %qs for -march", str);
9195         aarch64_print_hint_for_arch (str);
9196         break;
9197       case AARCH64_PARSE_INVALID_FEATURE:
9198         error ("invalid feature modifier in %<-march=%s%>", str);
9199         break;
9200       default:
9201         gcc_unreachable ();
9202     }
9203
9204   return false;
9205 }
9206
9207 /* Validate a command-line -mtune option.  Parse the cpu
9208    specified in STR and throw errors if appropriate.  Put the
9209    result, if it is valid, in RES.  Return whether the option is
9210    valid.  */
9211
9212 static bool
9213 aarch64_validate_mtune (const char *str, const struct processor **res)
9214 {
9215   enum aarch64_parse_opt_result parse_res
9216     = aarch64_parse_tune (str, res);
9217
9218   if (parse_res == AARCH64_PARSE_OK)
9219     return true;
9220
9221   switch (parse_res)
9222     {
9223       case AARCH64_PARSE_MISSING_ARG:
9224         error ("missing cpu name in %<-mtune=%s%>", str);
9225         break;
9226       case AARCH64_PARSE_INVALID_ARG:
9227         error ("unknown value %qs for -mtune", str);
9228         aarch64_print_hint_for_core (str);
9229         break;
9230       default:
9231         gcc_unreachable ();
9232     }
9233   return false;
9234 }
9235
9236 /* Return the CPU corresponding to the enum CPU.
9237    If it doesn't specify a cpu, return the default.  */
9238
9239 static const struct processor *
9240 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9241 {
9242   if (cpu != aarch64_none)
9243     return &all_cores[cpu];
9244
9245   /* The & 0x3f is to extract the bottom 6 bits that encode the
9246      default cpu as selected by the --with-cpu GCC configure option
9247      in config.gcc.
9248      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9249      flags mechanism should be reworked to make it more sane.  */
9250   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9251 }
9252
9253 /* Return the architecture corresponding to the enum ARCH.
9254    If it doesn't specify a valid architecture, return the default.  */
9255
9256 static const struct processor *
9257 aarch64_get_arch (enum aarch64_arch arch)
9258 {
9259   if (arch != aarch64_no_arch)
9260     return &all_architectures[arch];
9261
9262   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9263
9264   return &all_architectures[cpu->arch];
9265 }
9266
9267 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9268    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9269    tuning structs.  In particular it must set selected_tune and
9270    aarch64_isa_flags that define the available ISA features and tuning
9271    decisions.  It must also set selected_arch as this will be used to
9272    output the .arch asm tags for each function.  */
9273
9274 static void
9275 aarch64_override_options (void)
9276 {
9277   unsigned long cpu_isa = 0;
9278   unsigned long arch_isa = 0;
9279   aarch64_isa_flags = 0;
9280
9281   bool valid_cpu = true;
9282   bool valid_tune = true;
9283   bool valid_arch = true;
9284
9285   selected_cpu = NULL;
9286   selected_arch = NULL;
9287   selected_tune = NULL;
9288
9289   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9290      If either of -march or -mtune is given, they override their
9291      respective component of -mcpu.  */
9292   if (aarch64_cpu_string)
9293     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9294                                         &cpu_isa);
9295
9296   if (aarch64_arch_string)
9297     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9298                                           &arch_isa);
9299
9300   if (aarch64_tune_string)
9301     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9302
9303   /* If the user did not specify a processor, choose the default
9304      one for them.  This will be the CPU set during configuration using
9305      --with-cpu, otherwise it is "generic".  */
9306   if (!selected_cpu)
9307     {
9308       if (selected_arch)
9309         {
9310           selected_cpu = &all_cores[selected_arch->ident];
9311           aarch64_isa_flags = arch_isa;
9312           explicit_arch = selected_arch->arch;
9313         }
9314       else
9315         {
9316           /* Get default configure-time CPU.  */
9317           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9318           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9319         }
9320
9321       if (selected_tune)
9322         explicit_tune_core = selected_tune->ident;
9323     }
9324   /* If both -mcpu and -march are specified check that they are architecturally
9325      compatible, warn if they're not and prefer the -march ISA flags.  */
9326   else if (selected_arch)
9327     {
9328       if (selected_arch->arch != selected_cpu->arch)
9329         {
9330           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9331                        all_architectures[selected_cpu->arch].name,
9332                        selected_arch->name);
9333         }
9334       aarch64_isa_flags = arch_isa;
9335       explicit_arch = selected_arch->arch;
9336       explicit_tune_core = selected_tune ? selected_tune->ident
9337                                           : selected_cpu->ident;
9338     }
9339   else
9340     {
9341       /* -mcpu but no -march.  */
9342       aarch64_isa_flags = cpu_isa;
9343       explicit_tune_core = selected_tune ? selected_tune->ident
9344                                           : selected_cpu->ident;
9345       gcc_assert (selected_cpu);
9346       selected_arch = &all_architectures[selected_cpu->arch];
9347       explicit_arch = selected_arch->arch;
9348     }
9349
9350   /* Set the arch as well as we will need it when outputing
9351      the .arch directive in assembly.  */
9352   if (!selected_arch)
9353     {
9354       gcc_assert (selected_cpu);
9355       selected_arch = &all_architectures[selected_cpu->arch];
9356     }
9357
9358   if (!selected_tune)
9359     selected_tune = selected_cpu;
9360
9361 #ifndef HAVE_AS_MABI_OPTION
9362   /* The compiler may have been configured with 2.23.* binutils, which does
9363      not have support for ILP32.  */
9364   if (TARGET_ILP32)
9365     error ("Assembler does not support -mabi=ilp32");
9366 #endif
9367
9368   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9369     sorry ("Return address signing is only supported for -mabi=lp64");
9370
9371   /* Make sure we properly set up the explicit options.  */
9372   if ((aarch64_cpu_string && valid_cpu)
9373        || (aarch64_tune_string && valid_tune))
9374     gcc_assert (explicit_tune_core != aarch64_none);
9375
9376   if ((aarch64_cpu_string && valid_cpu)
9377        || (aarch64_arch_string && valid_arch))
9378     gcc_assert (explicit_arch != aarch64_no_arch);
9379
9380   aarch64_override_options_internal (&global_options);
9381
9382   /* Save these options as the default ones in case we push and pop them later
9383      while processing functions with potential target attributes.  */
9384   target_option_default_node = target_option_current_node
9385       = build_target_option_node (&global_options);
9386 }
9387
9388 /* Implement targetm.override_options_after_change.  */
9389
9390 static void
9391 aarch64_override_options_after_change (void)
9392 {
9393   aarch64_override_options_after_change_1 (&global_options);
9394 }
9395
9396 static struct machine_function *
9397 aarch64_init_machine_status (void)
9398 {
9399   struct machine_function *machine;
9400   machine = ggc_cleared_alloc<machine_function> ();
9401   return machine;
9402 }
9403
9404 void
9405 aarch64_init_expanders (void)
9406 {
9407   init_machine_status = aarch64_init_machine_status;
9408 }
9409
9410 /* A checking mechanism for the implementation of the various code models.  */
9411 static void
9412 initialize_aarch64_code_model (struct gcc_options *opts)
9413 {
9414    if (opts->x_flag_pic)
9415      {
9416        switch (opts->x_aarch64_cmodel_var)
9417          {
9418          case AARCH64_CMODEL_TINY:
9419            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9420            break;
9421          case AARCH64_CMODEL_SMALL:
9422 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9423            aarch64_cmodel = (flag_pic == 2
9424                              ? AARCH64_CMODEL_SMALL_PIC
9425                              : AARCH64_CMODEL_SMALL_SPIC);
9426 #else
9427            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9428 #endif
9429            break;
9430          case AARCH64_CMODEL_LARGE:
9431            sorry ("code model %qs with -f%s", "large",
9432                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9433            break;
9434          default:
9435            gcc_unreachable ();
9436          }
9437      }
9438    else
9439      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9440 }
9441
9442 /* Implement TARGET_OPTION_SAVE.  */
9443
9444 static void
9445 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9446 {
9447   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9448 }
9449
9450 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9451    using the information saved in PTR.  */
9452
9453 static void
9454 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9455 {
9456   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9457   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9458   opts->x_explicit_arch = ptr->x_explicit_arch;
9459   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9460   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9461
9462   aarch64_override_options_internal (opts);
9463 }
9464
9465 /* Implement TARGET_OPTION_PRINT.  */
9466
9467 static void
9468 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9469 {
9470   const struct processor *cpu
9471     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9472   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9473   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9474   std::string extension
9475     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9476
9477   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9478   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9479            arch->name, extension.c_str ());
9480 }
9481
9482 static GTY(()) tree aarch64_previous_fndecl;
9483
9484 void
9485 aarch64_reset_previous_fndecl (void)
9486 {
9487   aarch64_previous_fndecl = NULL;
9488 }
9489
9490 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9491    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9492    make sure optab availability predicates are recomputed when necessary.  */
9493
9494 void
9495 aarch64_save_restore_target_globals (tree new_tree)
9496 {
9497   if (TREE_TARGET_GLOBALS (new_tree))
9498     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9499   else if (new_tree == target_option_default_node)
9500     restore_target_globals (&default_target_globals);
9501   else
9502     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9503 }
9504
9505 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9506    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9507    of the function, if such exists.  This function may be called multiple
9508    times on a single function so use aarch64_previous_fndecl to avoid
9509    setting up identical state.  */
9510
9511 static void
9512 aarch64_set_current_function (tree fndecl)
9513 {
9514   if (!fndecl || fndecl == aarch64_previous_fndecl)
9515     return;
9516
9517   tree old_tree = (aarch64_previous_fndecl
9518                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9519                    : NULL_TREE);
9520
9521   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9522
9523   /* If current function has no attributes but the previous one did,
9524      use the default node.  */
9525   if (!new_tree && old_tree)
9526     new_tree = target_option_default_node;
9527
9528   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9529      the default have been handled by aarch64_save_restore_target_globals from
9530      aarch64_pragma_target_parse.  */
9531   if (old_tree == new_tree)
9532     return;
9533
9534   aarch64_previous_fndecl = fndecl;
9535
9536   /* First set the target options.  */
9537   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9538
9539   aarch64_save_restore_target_globals (new_tree);
9540 }
9541
9542 /* Enum describing the various ways we can handle attributes.
9543    In many cases we can reuse the generic option handling machinery.  */
9544
9545 enum aarch64_attr_opt_type
9546 {
9547   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9548   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9549   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9550   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9551 };
9552
9553 /* All the information needed to handle a target attribute.
9554    NAME is the name of the attribute.
9555    ATTR_TYPE specifies the type of behavior of the attribute as described
9556    in the definition of enum aarch64_attr_opt_type.
9557    ALLOW_NEG is true if the attribute supports a "no-" form.
9558    HANDLER is the function that takes the attribute string and whether
9559    it is a pragma or attribute and handles the option.  It is needed only
9560    when the ATTR_TYPE is aarch64_attr_custom.
9561    OPT_NUM is the enum specifying the option that the attribute modifies.
9562    This is needed for attributes that mirror the behavior of a command-line
9563    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9564    aarch64_attr_enum.  */
9565
9566 struct aarch64_attribute_info
9567 {
9568   const char *name;
9569   enum aarch64_attr_opt_type attr_type;
9570   bool allow_neg;
9571   bool (*handler) (const char *, const char *);
9572   enum opt_code opt_num;
9573 };
9574
9575 /* Handle the ARCH_STR argument to the arch= target attribute.
9576    PRAGMA_OR_ATTR is used in potential error messages.  */
9577
9578 static bool
9579 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9580 {
9581   const struct processor *tmp_arch = NULL;
9582   enum aarch64_parse_opt_result parse_res
9583     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9584
9585   if (parse_res == AARCH64_PARSE_OK)
9586     {
9587       gcc_assert (tmp_arch);
9588       selected_arch = tmp_arch;
9589       explicit_arch = selected_arch->arch;
9590       return true;
9591     }
9592
9593   switch (parse_res)
9594     {
9595       case AARCH64_PARSE_MISSING_ARG:
9596         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9597         break;
9598       case AARCH64_PARSE_INVALID_ARG:
9599         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9600         aarch64_print_hint_for_arch (str);
9601         break;
9602       case AARCH64_PARSE_INVALID_FEATURE:
9603         error ("invalid feature modifier %qs for 'arch' target %s",
9604                str, pragma_or_attr);
9605         break;
9606       default:
9607         gcc_unreachable ();
9608     }
9609
9610   return false;
9611 }
9612
9613 /* Handle the argument CPU_STR to the cpu= target attribute.
9614    PRAGMA_OR_ATTR is used in potential error messages.  */
9615
9616 static bool
9617 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9618 {
9619   const struct processor *tmp_cpu = NULL;
9620   enum aarch64_parse_opt_result parse_res
9621     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9622
9623   if (parse_res == AARCH64_PARSE_OK)
9624     {
9625       gcc_assert (tmp_cpu);
9626       selected_tune = tmp_cpu;
9627       explicit_tune_core = selected_tune->ident;
9628
9629       selected_arch = &all_architectures[tmp_cpu->arch];
9630       explicit_arch = selected_arch->arch;
9631       return true;
9632     }
9633
9634   switch (parse_res)
9635     {
9636       case AARCH64_PARSE_MISSING_ARG:
9637         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9638         break;
9639       case AARCH64_PARSE_INVALID_ARG:
9640         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9641         aarch64_print_hint_for_core (str);
9642         break;
9643       case AARCH64_PARSE_INVALID_FEATURE:
9644         error ("invalid feature modifier %qs for 'cpu' target %s",
9645                str, pragma_or_attr);
9646         break;
9647       default:
9648         gcc_unreachable ();
9649     }
9650
9651   return false;
9652 }
9653
9654 /* Handle the argument STR to the tune= target attribute.
9655    PRAGMA_OR_ATTR is used in potential error messages.  */
9656
9657 static bool
9658 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9659 {
9660   const struct processor *tmp_tune = NULL;
9661   enum aarch64_parse_opt_result parse_res
9662     = aarch64_parse_tune (str, &tmp_tune);
9663
9664   if (parse_res == AARCH64_PARSE_OK)
9665     {
9666       gcc_assert (tmp_tune);
9667       selected_tune = tmp_tune;
9668       explicit_tune_core = selected_tune->ident;
9669       return true;
9670     }
9671
9672   switch (parse_res)
9673     {
9674       case AARCH64_PARSE_INVALID_ARG:
9675         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9676         aarch64_print_hint_for_core (str);
9677         break;
9678       default:
9679         gcc_unreachable ();
9680     }
9681
9682   return false;
9683 }
9684
9685 /* Parse an architecture extensions target attribute string specified in STR.
9686    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9687    if successful.  Update aarch64_isa_flags to reflect the ISA features
9688    modified.
9689    PRAGMA_OR_ATTR is used in potential error messages.  */
9690
9691 static bool
9692 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9693 {
9694   enum aarch64_parse_opt_result parse_res;
9695   unsigned long isa_flags = aarch64_isa_flags;
9696
9697   /* We allow "+nothing" in the beginning to clear out all architectural
9698      features if the user wants to handpick specific features.  */
9699   if (strncmp ("+nothing", str, 8) == 0)
9700     {
9701       isa_flags = 0;
9702       str += 8;
9703     }
9704
9705   parse_res = aarch64_parse_extension (str, &isa_flags);
9706
9707   if (parse_res == AARCH64_PARSE_OK)
9708     {
9709       aarch64_isa_flags = isa_flags;
9710       return true;
9711     }
9712
9713   switch (parse_res)
9714     {
9715       case AARCH64_PARSE_MISSING_ARG:
9716         error ("missing feature modifier in target %s %qs",
9717                pragma_or_attr, str);
9718         break;
9719
9720       case AARCH64_PARSE_INVALID_FEATURE:
9721         error ("invalid feature modifier in target %s %qs",
9722                pragma_or_attr, str);
9723         break;
9724
9725       default:
9726         gcc_unreachable ();
9727     }
9728
9729  return false;
9730 }
9731
9732 /* The target attributes that we support.  On top of these we also support just
9733    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9734    handled explicitly in aarch64_process_one_target_attr.  */
9735
9736 static const struct aarch64_attribute_info aarch64_attributes[] =
9737 {
9738   { "general-regs-only", aarch64_attr_mask, false, NULL,
9739      OPT_mgeneral_regs_only },
9740   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9741      OPT_mfix_cortex_a53_835769 },
9742   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9743      OPT_mfix_cortex_a53_843419 },
9744   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9745   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9746   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9747      OPT_momit_leaf_frame_pointer },
9748   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9749   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9750      OPT_march_ },
9751   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9752   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9753      OPT_mtune_ },
9754   { "sign-return-address", aarch64_attr_enum, false, NULL,
9755      OPT_msign_return_address_ },
9756   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9757 };
9758
9759 /* Parse ARG_STR which contains the definition of one target attribute.
9760    Show appropriate errors if any or return true if the attribute is valid.
9761    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9762    we're processing a target attribute or pragma.  */
9763
9764 static bool
9765 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9766 {
9767   bool invert = false;
9768
9769   size_t len = strlen (arg_str);
9770
9771   if (len == 0)
9772     {
9773       error ("malformed target %s", pragma_or_attr);
9774       return false;
9775     }
9776
9777   char *str_to_check = (char *) alloca (len + 1);
9778   strcpy (str_to_check, arg_str);
9779
9780   /* Skip leading whitespace.  */
9781   while (*str_to_check == ' ' || *str_to_check == '\t')
9782     str_to_check++;
9783
9784   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9785      It is easier to detect and handle it explicitly here rather than going
9786      through the machinery for the rest of the target attributes in this
9787      function.  */
9788   if (*str_to_check == '+')
9789     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9790
9791   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9792     {
9793       invert = true;
9794       str_to_check += 3;
9795     }
9796   char *arg = strchr (str_to_check, '=');
9797
9798   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9799      and point ARG to "foo".  */
9800   if (arg)
9801     {
9802       *arg = '\0';
9803       arg++;
9804     }
9805   const struct aarch64_attribute_info *p_attr;
9806   bool found = false;
9807   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9808     {
9809       /* If the names don't match up, or the user has given an argument
9810          to an attribute that doesn't accept one, or didn't give an argument
9811          to an attribute that expects one, fail to match.  */
9812       if (strcmp (str_to_check, p_attr->name) != 0)
9813         continue;
9814
9815       found = true;
9816       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9817                               || p_attr->attr_type == aarch64_attr_enum;
9818
9819       if (attr_need_arg_p ^ (arg != NULL))
9820         {
9821           error ("target %s %qs does not accept an argument",
9822                   pragma_or_attr, str_to_check);
9823           return false;
9824         }
9825
9826       /* If the name matches but the attribute does not allow "no-" versions
9827          then we can't match.  */
9828       if (invert && !p_attr->allow_neg)
9829         {
9830           error ("target %s %qs does not allow a negated form",
9831                   pragma_or_attr, str_to_check);
9832           return false;
9833         }
9834
9835       switch (p_attr->attr_type)
9836         {
9837         /* Has a custom handler registered.
9838            For example, cpu=, arch=, tune=.  */
9839           case aarch64_attr_custom:
9840             gcc_assert (p_attr->handler);
9841             if (!p_attr->handler (arg, pragma_or_attr))
9842               return false;
9843             break;
9844
9845           /* Either set or unset a boolean option.  */
9846           case aarch64_attr_bool:
9847             {
9848               struct cl_decoded_option decoded;
9849
9850               generate_option (p_attr->opt_num, NULL, !invert,
9851                                CL_TARGET, &decoded);
9852               aarch64_handle_option (&global_options, &global_options_set,
9853                                       &decoded, input_location);
9854               break;
9855             }
9856           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9857              should know what mask to apply given the option number.  */
9858           case aarch64_attr_mask:
9859             {
9860               struct cl_decoded_option decoded;
9861               /* We only need to specify the option number.
9862                  aarch64_handle_option will know which mask to apply.  */
9863               decoded.opt_index = p_attr->opt_num;
9864               decoded.value = !invert;
9865               aarch64_handle_option (&global_options, &global_options_set,
9866                                       &decoded, input_location);
9867               break;
9868             }
9869           /* Use the option setting machinery to set an option to an enum.  */
9870           case aarch64_attr_enum:
9871             {
9872               gcc_assert (arg);
9873               bool valid;
9874               int value;
9875               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9876                                               &value, CL_TARGET);
9877               if (valid)
9878                 {
9879                   set_option (&global_options, NULL, p_attr->opt_num, value,
9880                               NULL, DK_UNSPECIFIED, input_location,
9881                               global_dc);
9882                 }
9883               else
9884                 {
9885                   error ("target %s %s=%s is not valid",
9886                          pragma_or_attr, str_to_check, arg);
9887                 }
9888               break;
9889             }
9890           default:
9891             gcc_unreachable ();
9892         }
9893     }
9894
9895   /* If we reached here we either have found an attribute and validated
9896      it or didn't match any.  If we matched an attribute but its arguments
9897      were malformed we will have returned false already.  */
9898   return found;
9899 }
9900
9901 /* Count how many times the character C appears in
9902    NULL-terminated string STR.  */
9903
9904 static unsigned int
9905 num_occurences_in_str (char c, char *str)
9906 {
9907   unsigned int res = 0;
9908   while (*str != '\0')
9909     {
9910       if (*str == c)
9911         res++;
9912
9913       str++;
9914     }
9915
9916   return res;
9917 }
9918
9919 /* Parse the tree in ARGS that contains the target attribute information
9920    and update the global target options space.  PRAGMA_OR_ATTR is a string
9921    to be used in error messages, specifying whether this is processing
9922    a target attribute or a target pragma.  */
9923
9924 bool
9925 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9926 {
9927   if (TREE_CODE (args) == TREE_LIST)
9928     {
9929       do
9930         {
9931           tree head = TREE_VALUE (args);
9932           if (head)
9933             {
9934               if (!aarch64_process_target_attr (head, pragma_or_attr))
9935                 return false;
9936             }
9937           args = TREE_CHAIN (args);
9938         } while (args);
9939
9940       return true;
9941     }
9942
9943   if (TREE_CODE (args) != STRING_CST)
9944     {
9945       error ("attribute %<target%> argument not a string");
9946       return false;
9947     }
9948
9949   size_t len = strlen (TREE_STRING_POINTER (args));
9950   char *str_to_check = (char *) alloca (len + 1);
9951   strcpy (str_to_check, TREE_STRING_POINTER (args));
9952
9953   if (len == 0)
9954     {
9955       error ("malformed target %s value", pragma_or_attr);
9956       return false;
9957     }
9958
9959   /* Used to catch empty spaces between commas i.e.
9960      attribute ((target ("attr1,,attr2"))).  */
9961   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9962
9963   /* Handle multiple target attributes separated by ','.  */
9964   char *token = strtok (str_to_check, ",");
9965
9966   unsigned int num_attrs = 0;
9967   while (token)
9968     {
9969       num_attrs++;
9970       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9971         {
9972           error ("target %s %qs is invalid", pragma_or_attr, token);
9973           return false;
9974         }
9975
9976       token = strtok (NULL, ",");
9977     }
9978
9979   if (num_attrs != num_commas + 1)
9980     {
9981       error ("malformed target %s list %qs",
9982               pragma_or_attr, TREE_STRING_POINTER (args));
9983       return false;
9984     }
9985
9986   return true;
9987 }
9988
9989 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9990    process attribute ((target ("..."))).  */
9991
9992 static bool
9993 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9994 {
9995   struct cl_target_option cur_target;
9996   bool ret;
9997   tree old_optimize;
9998   tree new_target, new_optimize;
9999   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10000
10001   /* If what we're processing is the current pragma string then the
10002      target option node is already stored in target_option_current_node
10003      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
10004      having to re-parse the string.  This is especially useful to keep
10005      arm_neon.h compile times down since that header contains a lot
10006      of intrinsics enclosed in pragmas.  */
10007   if (!existing_target && args == current_target_pragma)
10008     {
10009       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
10010       return true;
10011     }
10012   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10013
10014   old_optimize = build_optimization_node (&global_options);
10015   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
10016
10017   /* If the function changed the optimization levels as well as setting
10018      target options, start with the optimizations specified.  */
10019   if (func_optimize && func_optimize != old_optimize)
10020     cl_optimization_restore (&global_options,
10021                              TREE_OPTIMIZATION (func_optimize));
10022
10023   /* Save the current target options to restore at the end.  */
10024   cl_target_option_save (&cur_target, &global_options);
10025
10026   /* If fndecl already has some target attributes applied to it, unpack
10027      them so that we add this attribute on top of them, rather than
10028      overwriting them.  */
10029   if (existing_target)
10030     {
10031       struct cl_target_option *existing_options
10032         = TREE_TARGET_OPTION (existing_target);
10033
10034       if (existing_options)
10035         cl_target_option_restore (&global_options, existing_options);
10036     }
10037   else
10038     cl_target_option_restore (&global_options,
10039                         TREE_TARGET_OPTION (target_option_current_node));
10040
10041
10042   ret = aarch64_process_target_attr (args, "attribute");
10043
10044   /* Set up any additional state.  */
10045   if (ret)
10046     {
10047       aarch64_override_options_internal (&global_options);
10048       /* Initialize SIMD builtins if we haven't already.
10049          Set current_target_pragma to NULL for the duration so that
10050          the builtin initialization code doesn't try to tag the functions
10051          being built with the attributes specified by any current pragma, thus
10052          going into an infinite recursion.  */
10053       if (TARGET_SIMD)
10054         {
10055           tree saved_current_target_pragma = current_target_pragma;
10056           current_target_pragma = NULL;
10057           aarch64_init_simd_builtins ();
10058           current_target_pragma = saved_current_target_pragma;
10059         }
10060       new_target = build_target_option_node (&global_options);
10061     }
10062   else
10063     new_target = NULL;
10064
10065   new_optimize = build_optimization_node (&global_options);
10066
10067   if (fndecl && ret)
10068     {
10069       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10070
10071       if (old_optimize != new_optimize)
10072         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10073     }
10074
10075   cl_target_option_restore (&global_options, &cur_target);
10076
10077   if (old_optimize != new_optimize)
10078     cl_optimization_restore (&global_options,
10079                              TREE_OPTIMIZATION (old_optimize));
10080   return ret;
10081 }
10082
10083 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10084    tri-bool options (yes, no, don't care) and the default value is
10085    DEF, determine whether to reject inlining.  */
10086
10087 static bool
10088 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10089                                      int dont_care, int def)
10090 {
10091   /* If the callee doesn't care, always allow inlining.  */
10092   if (callee == dont_care)
10093     return true;
10094
10095   /* If the caller doesn't care, always allow inlining.  */
10096   if (caller == dont_care)
10097     return true;
10098
10099   /* Otherwise, allow inlining if either the callee and caller values
10100      agree, or if the callee is using the default value.  */
10101   return (callee == caller || callee == def);
10102 }
10103
10104 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10105    to inline CALLEE into CALLER based on target-specific info.
10106    Make sure that the caller and callee have compatible architectural
10107    features.  Then go through the other possible target attributes
10108    and see if they can block inlining.  Try not to reject always_inline
10109    callees unless they are incompatible architecturally.  */
10110
10111 static bool
10112 aarch64_can_inline_p (tree caller, tree callee)
10113 {
10114   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10115   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10116
10117   /* If callee has no option attributes, then it is ok to inline.  */
10118   if (!callee_tree)
10119     return true;
10120
10121   struct cl_target_option *caller_opts
10122         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10123                                            : target_option_default_node);
10124
10125   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10126
10127
10128   /* Callee's ISA flags should be a subset of the caller's.  */
10129   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10130        != callee_opts->x_aarch64_isa_flags)
10131     return false;
10132
10133   /* Allow non-strict aligned functions inlining into strict
10134      aligned ones.  */
10135   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10136        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10137       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10138            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10139     return false;
10140
10141   bool always_inline = lookup_attribute ("always_inline",
10142                                           DECL_ATTRIBUTES (callee));
10143
10144   /* If the architectural features match up and the callee is always_inline
10145      then the other attributes don't matter.  */
10146   if (always_inline)
10147     return true;
10148
10149   if (caller_opts->x_aarch64_cmodel_var
10150       != callee_opts->x_aarch64_cmodel_var)
10151     return false;
10152
10153   if (caller_opts->x_aarch64_tls_dialect
10154       != callee_opts->x_aarch64_tls_dialect)
10155     return false;
10156
10157   /* Honour explicit requests to workaround errata.  */
10158   if (!aarch64_tribools_ok_for_inlining_p (
10159           caller_opts->x_aarch64_fix_a53_err835769,
10160           callee_opts->x_aarch64_fix_a53_err835769,
10161           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10162     return false;
10163
10164   if (!aarch64_tribools_ok_for_inlining_p (
10165           caller_opts->x_aarch64_fix_a53_err843419,
10166           callee_opts->x_aarch64_fix_a53_err843419,
10167           2, TARGET_FIX_ERR_A53_843419))
10168     return false;
10169
10170   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10171      caller and calle and they don't match up, reject inlining.  */
10172   if (!aarch64_tribools_ok_for_inlining_p (
10173           caller_opts->x_flag_omit_leaf_frame_pointer,
10174           callee_opts->x_flag_omit_leaf_frame_pointer,
10175           2, 1))
10176     return false;
10177
10178   /* If the callee has specific tuning overrides, respect them.  */
10179   if (callee_opts->x_aarch64_override_tune_string != NULL
10180       && caller_opts->x_aarch64_override_tune_string == NULL)
10181     return false;
10182
10183   /* If the user specified tuning override strings for the
10184      caller and callee and they don't match up, reject inlining.
10185      We just do a string compare here, we don't analyze the meaning
10186      of the string, as it would be too costly for little gain.  */
10187   if (callee_opts->x_aarch64_override_tune_string
10188       && caller_opts->x_aarch64_override_tune_string
10189       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10190                   caller_opts->x_aarch64_override_tune_string) != 0))
10191     return false;
10192
10193   return true;
10194 }
10195
10196 /* Return true if SYMBOL_REF X binds locally.  */
10197
10198 static bool
10199 aarch64_symbol_binds_local_p (const_rtx x)
10200 {
10201   return (SYMBOL_REF_DECL (x)
10202           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10203           : SYMBOL_REF_LOCAL_P (x));
10204 }
10205
10206 /* Return true if SYMBOL_REF X is thread local */
10207 static bool
10208 aarch64_tls_symbol_p (rtx x)
10209 {
10210   if (! TARGET_HAVE_TLS)
10211     return false;
10212
10213   if (GET_CODE (x) != SYMBOL_REF)
10214     return false;
10215
10216   return SYMBOL_REF_TLS_MODEL (x) != 0;
10217 }
10218
10219 /* Classify a TLS symbol into one of the TLS kinds.  */
10220 enum aarch64_symbol_type
10221 aarch64_classify_tls_symbol (rtx x)
10222 {
10223   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10224
10225   switch (tls_kind)
10226     {
10227     case TLS_MODEL_GLOBAL_DYNAMIC:
10228     case TLS_MODEL_LOCAL_DYNAMIC:
10229       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10230
10231     case TLS_MODEL_INITIAL_EXEC:
10232       switch (aarch64_cmodel)
10233         {
10234         case AARCH64_CMODEL_TINY:
10235         case AARCH64_CMODEL_TINY_PIC:
10236           return SYMBOL_TINY_TLSIE;
10237         default:
10238           return SYMBOL_SMALL_TLSIE;
10239         }
10240
10241     case TLS_MODEL_LOCAL_EXEC:
10242       if (aarch64_tls_size == 12)
10243         return SYMBOL_TLSLE12;
10244       else if (aarch64_tls_size == 24)
10245         return SYMBOL_TLSLE24;
10246       else if (aarch64_tls_size == 32)
10247         return SYMBOL_TLSLE32;
10248       else if (aarch64_tls_size == 48)
10249         return SYMBOL_TLSLE48;
10250       else
10251         gcc_unreachable ();
10252
10253     case TLS_MODEL_EMULATED:
10254     case TLS_MODEL_NONE:
10255       return SYMBOL_FORCE_TO_MEM;
10256
10257     default:
10258       gcc_unreachable ();
10259     }
10260 }
10261
10262 /* Return the method that should be used to access SYMBOL_REF or
10263    LABEL_REF X.  */
10264
10265 enum aarch64_symbol_type
10266 aarch64_classify_symbol (rtx x, rtx offset)
10267 {
10268   if (GET_CODE (x) == LABEL_REF)
10269     {
10270       switch (aarch64_cmodel)
10271         {
10272         case AARCH64_CMODEL_LARGE:
10273           return SYMBOL_FORCE_TO_MEM;
10274
10275         case AARCH64_CMODEL_TINY_PIC:
10276         case AARCH64_CMODEL_TINY:
10277           return SYMBOL_TINY_ABSOLUTE;
10278
10279         case AARCH64_CMODEL_SMALL_SPIC:
10280         case AARCH64_CMODEL_SMALL_PIC:
10281         case AARCH64_CMODEL_SMALL:
10282           return SYMBOL_SMALL_ABSOLUTE;
10283
10284         default:
10285           gcc_unreachable ();
10286         }
10287     }
10288
10289   if (GET_CODE (x) == SYMBOL_REF)
10290     {
10291       if (aarch64_tls_symbol_p (x))
10292         return aarch64_classify_tls_symbol (x);
10293
10294       switch (aarch64_cmodel)
10295         {
10296         case AARCH64_CMODEL_TINY:
10297           /* When we retrieve symbol + offset address, we have to make sure
10298              the offset does not cause overflow of the final address.  But
10299              we have no way of knowing the address of symbol at compile time
10300              so we can't accurately say if the distance between the PC and
10301              symbol + offset is outside the addressible range of +/-1M in the
10302              TINY code model.  So we rely on images not being greater than
10303              1M and cap the offset at 1M and anything beyond 1M will have to
10304              be loaded using an alternative mechanism.  Furthermore if the
10305              symbol is a weak reference to something that isn't known to
10306              resolve to a symbol in this module, then force to memory.  */
10307           if ((SYMBOL_REF_WEAK (x)
10308                && !aarch64_symbol_binds_local_p (x))
10309               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10310             return SYMBOL_FORCE_TO_MEM;
10311           return SYMBOL_TINY_ABSOLUTE;
10312
10313         case AARCH64_CMODEL_SMALL:
10314           /* Same reasoning as the tiny code model, but the offset cap here is
10315              4G.  */
10316           if ((SYMBOL_REF_WEAK (x)
10317                && !aarch64_symbol_binds_local_p (x))
10318               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10319                             HOST_WIDE_INT_C (4294967264)))
10320             return SYMBOL_FORCE_TO_MEM;
10321           return SYMBOL_SMALL_ABSOLUTE;
10322
10323         case AARCH64_CMODEL_TINY_PIC:
10324           if (!aarch64_symbol_binds_local_p (x))
10325             return SYMBOL_TINY_GOT;
10326           return SYMBOL_TINY_ABSOLUTE;
10327
10328         case AARCH64_CMODEL_SMALL_SPIC:
10329         case AARCH64_CMODEL_SMALL_PIC:
10330           if (!aarch64_symbol_binds_local_p (x))
10331             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10332                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10333           return SYMBOL_SMALL_ABSOLUTE;
10334
10335         case AARCH64_CMODEL_LARGE:
10336           /* This is alright even in PIC code as the constant
10337              pool reference is always PC relative and within
10338              the same translation unit.  */
10339           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10340             return SYMBOL_SMALL_ABSOLUTE;
10341           else
10342             return SYMBOL_FORCE_TO_MEM;
10343
10344         default:
10345           gcc_unreachable ();
10346         }
10347     }
10348
10349   /* By default push everything into the constant pool.  */
10350   return SYMBOL_FORCE_TO_MEM;
10351 }
10352
10353 bool
10354 aarch64_constant_address_p (rtx x)
10355 {
10356   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10357 }
10358
10359 bool
10360 aarch64_legitimate_pic_operand_p (rtx x)
10361 {
10362   if (GET_CODE (x) == SYMBOL_REF
10363       || (GET_CODE (x) == CONST
10364           && GET_CODE (XEXP (x, 0)) == PLUS
10365           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10366      return false;
10367
10368   return true;
10369 }
10370
10371 /* Return true if X holds either a quarter-precision or
10372      floating-point +0.0 constant.  */
10373 static bool
10374 aarch64_valid_floating_const (rtx x)
10375 {
10376   if (!CONST_DOUBLE_P (x))
10377     return false;
10378
10379   /* This call determines which constants can be used in mov<mode>
10380      as integer moves instead of constant loads.  */
10381   if (aarch64_float_const_rtx_p (x))
10382     return true;
10383
10384   return aarch64_float_const_representable_p (x);
10385 }
10386
10387 static bool
10388 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10389 {
10390   /* Do not allow vector struct mode constants.  We could support
10391      0 and -1 easily, but they need support in aarch64-simd.md.  */
10392   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10393     return false;
10394
10395   /* For these cases we never want to use a literal load.
10396      As such we have to prevent the compiler from forcing these
10397      to memory.  */
10398   if ((GET_CODE (x) == CONST_VECTOR
10399        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10400       || CONST_INT_P (x)
10401       || aarch64_valid_floating_const (x)
10402       || aarch64_can_const_movi_rtx_p (x, mode)
10403       || aarch64_float_const_rtx_p (x))
10404         return !targetm.cannot_force_const_mem (mode, x);
10405
10406   if (GET_CODE (x) == HIGH
10407       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10408     return true;
10409
10410   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10411      so spilling them is better than rematerialization.  */
10412   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10413     return true;
10414
10415   return aarch64_constant_address_p (x);
10416 }
10417
10418 rtx
10419 aarch64_load_tp (rtx target)
10420 {
10421   if (!target
10422       || GET_MODE (target) != Pmode
10423       || !register_operand (target, Pmode))
10424     target = gen_reg_rtx (Pmode);
10425
10426   /* Can return in any reg.  */
10427   emit_insn (gen_aarch64_load_tp_hard (target));
10428   return target;
10429 }
10430
10431 /* On AAPCS systems, this is the "struct __va_list".  */
10432 static GTY(()) tree va_list_type;
10433
10434 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10435    Return the type to use as __builtin_va_list.
10436
10437    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10438
10439    struct __va_list
10440    {
10441      void *__stack;
10442      void *__gr_top;
10443      void *__vr_top;
10444      int   __gr_offs;
10445      int   __vr_offs;
10446    };  */
10447
10448 static tree
10449 aarch64_build_builtin_va_list (void)
10450 {
10451   tree va_list_name;
10452   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10453
10454   /* Create the type.  */
10455   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10456   /* Give it the required name.  */
10457   va_list_name = build_decl (BUILTINS_LOCATION,
10458                              TYPE_DECL,
10459                              get_identifier ("__va_list"),
10460                              va_list_type);
10461   DECL_ARTIFICIAL (va_list_name) = 1;
10462   TYPE_NAME (va_list_type) = va_list_name;
10463   TYPE_STUB_DECL (va_list_type) = va_list_name;
10464
10465   /* Create the fields.  */
10466   f_stack = build_decl (BUILTINS_LOCATION,
10467                         FIELD_DECL, get_identifier ("__stack"),
10468                         ptr_type_node);
10469   f_grtop = build_decl (BUILTINS_LOCATION,
10470                         FIELD_DECL, get_identifier ("__gr_top"),
10471                         ptr_type_node);
10472   f_vrtop = build_decl (BUILTINS_LOCATION,
10473                         FIELD_DECL, get_identifier ("__vr_top"),
10474                         ptr_type_node);
10475   f_groff = build_decl (BUILTINS_LOCATION,
10476                         FIELD_DECL, get_identifier ("__gr_offs"),
10477                         integer_type_node);
10478   f_vroff = build_decl (BUILTINS_LOCATION,
10479                         FIELD_DECL, get_identifier ("__vr_offs"),
10480                         integer_type_node);
10481
10482   /* Tell tree-stdarg pass about our internal offset fields.
10483      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10484      purpose to identify whether the code is updating va_list internal
10485      offset fields through irregular way.  */
10486   va_list_gpr_counter_field = f_groff;
10487   va_list_fpr_counter_field = f_vroff;
10488
10489   DECL_ARTIFICIAL (f_stack) = 1;
10490   DECL_ARTIFICIAL (f_grtop) = 1;
10491   DECL_ARTIFICIAL (f_vrtop) = 1;
10492   DECL_ARTIFICIAL (f_groff) = 1;
10493   DECL_ARTIFICIAL (f_vroff) = 1;
10494
10495   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10496   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10497   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10498   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10499   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10500
10501   TYPE_FIELDS (va_list_type) = f_stack;
10502   DECL_CHAIN (f_stack) = f_grtop;
10503   DECL_CHAIN (f_grtop) = f_vrtop;
10504   DECL_CHAIN (f_vrtop) = f_groff;
10505   DECL_CHAIN (f_groff) = f_vroff;
10506
10507   /* Compute its layout.  */
10508   layout_type (va_list_type);
10509
10510   return va_list_type;
10511 }
10512
10513 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10514 static void
10515 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10516 {
10517   const CUMULATIVE_ARGS *cum;
10518   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10519   tree stack, grtop, vrtop, groff, vroff;
10520   tree t;
10521   int gr_save_area_size = cfun->va_list_gpr_size;
10522   int vr_save_area_size = cfun->va_list_fpr_size;
10523   int vr_offset;
10524
10525   cum = &crtl->args.info;
10526   if (cfun->va_list_gpr_size)
10527     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10528                              cfun->va_list_gpr_size);
10529   if (cfun->va_list_fpr_size)
10530     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10531                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10532
10533   if (!TARGET_FLOAT)
10534     {
10535       gcc_assert (cum->aapcs_nvrn == 0);
10536       vr_save_area_size = 0;
10537     }
10538
10539   f_stack = TYPE_FIELDS (va_list_type_node);
10540   f_grtop = DECL_CHAIN (f_stack);
10541   f_vrtop = DECL_CHAIN (f_grtop);
10542   f_groff = DECL_CHAIN (f_vrtop);
10543   f_vroff = DECL_CHAIN (f_groff);
10544
10545   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10546                   NULL_TREE);
10547   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10548                   NULL_TREE);
10549   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10550                   NULL_TREE);
10551   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10552                   NULL_TREE);
10553   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10554                   NULL_TREE);
10555
10556   /* Emit code to initialize STACK, which points to the next varargs stack
10557      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10558      by named arguments.  STACK is 8-byte aligned.  */
10559   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10560   if (cum->aapcs_stack_size > 0)
10561     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10562   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10563   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10564
10565   /* Emit code to initialize GRTOP, the top of the GR save area.
10566      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10567   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10568   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10569   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10570
10571   /* Emit code to initialize VRTOP, the top of the VR save area.
10572      This address is gr_save_area_bytes below GRTOP, rounded
10573      down to the next 16-byte boundary.  */
10574   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10575   vr_offset = ROUND_UP (gr_save_area_size,
10576                         STACK_BOUNDARY / BITS_PER_UNIT);
10577
10578   if (vr_offset)
10579     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10580   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10581   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10582
10583   /* Emit code to initialize GROFF, the offset from GRTOP of the
10584      next GPR argument.  */
10585   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10586               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10587   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10588
10589   /* Likewise emit code to initialize VROFF, the offset from FTOP
10590      of the next VR argument.  */
10591   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10592               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10593   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10594 }
10595
10596 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10597
10598 static tree
10599 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10600                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10601 {
10602   tree addr;
10603   bool indirect_p;
10604   bool is_ha;           /* is HFA or HVA.  */
10605   bool dw_align;        /* double-word align.  */
10606   machine_mode ag_mode = VOIDmode;
10607   int nregs;
10608   machine_mode mode;
10609
10610   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10611   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10612   HOST_WIDE_INT size, rsize, adjust, align;
10613   tree t, u, cond1, cond2;
10614
10615   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10616   if (indirect_p)
10617     type = build_pointer_type (type);
10618
10619   mode = TYPE_MODE (type);
10620
10621   f_stack = TYPE_FIELDS (va_list_type_node);
10622   f_grtop = DECL_CHAIN (f_stack);
10623   f_vrtop = DECL_CHAIN (f_grtop);
10624   f_groff = DECL_CHAIN (f_vrtop);
10625   f_vroff = DECL_CHAIN (f_groff);
10626
10627   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10628                   f_stack, NULL_TREE);
10629   size = int_size_in_bytes (type);
10630   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10631
10632   dw_align = false;
10633   adjust = 0;
10634   if (aarch64_vfp_is_call_or_return_candidate (mode,
10635                                                type,
10636                                                &ag_mode,
10637                                                &nregs,
10638                                                &is_ha))
10639     {
10640       /* TYPE passed in fp/simd registers.  */
10641       if (!TARGET_FLOAT)
10642         aarch64_err_no_fpadvsimd (mode, "varargs");
10643
10644       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10645                       unshare_expr (valist), f_vrtop, NULL_TREE);
10646       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10647                       unshare_expr (valist), f_vroff, NULL_TREE);
10648
10649       rsize = nregs * UNITS_PER_VREG;
10650
10651       if (is_ha)
10652         {
10653           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10654             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10655         }
10656       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10657                && size < UNITS_PER_VREG)
10658         {
10659           adjust = UNITS_PER_VREG - size;
10660         }
10661     }
10662   else
10663     {
10664       /* TYPE passed in general registers.  */
10665       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10666                       unshare_expr (valist), f_grtop, NULL_TREE);
10667       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10668                       unshare_expr (valist), f_groff, NULL_TREE);
10669       rsize = ROUND_UP (size, UNITS_PER_WORD);
10670       nregs = rsize / UNITS_PER_WORD;
10671
10672       if (align > 8)
10673         dw_align = true;
10674
10675       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10676           && size < UNITS_PER_WORD)
10677         {
10678           adjust = UNITS_PER_WORD  - size;
10679         }
10680     }
10681
10682   /* Get a local temporary for the field value.  */
10683   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10684
10685   /* Emit code to branch if off >= 0.  */
10686   t = build2 (GE_EXPR, boolean_type_node, off,
10687               build_int_cst (TREE_TYPE (off), 0));
10688   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10689
10690   if (dw_align)
10691     {
10692       /* Emit: offs = (offs + 15) & -16.  */
10693       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10694                   build_int_cst (TREE_TYPE (off), 15));
10695       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10696                   build_int_cst (TREE_TYPE (off), -16));
10697       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10698     }
10699   else
10700     roundup = NULL;
10701
10702   /* Update ap.__[g|v]r_offs  */
10703   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10704               build_int_cst (TREE_TYPE (off), rsize));
10705   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10706
10707   /* String up.  */
10708   if (roundup)
10709     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10710
10711   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10712   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10713               build_int_cst (TREE_TYPE (f_off), 0));
10714   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10715
10716   /* String up: make sure the assignment happens before the use.  */
10717   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10718   COND_EXPR_ELSE (cond1) = t;
10719
10720   /* Prepare the trees handling the argument that is passed on the stack;
10721      the top level node will store in ON_STACK.  */
10722   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10723   if (align > 8)
10724     {
10725       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10726       t = fold_convert (intDI_type_node, arg);
10727       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10728                   build_int_cst (TREE_TYPE (t), 15));
10729       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10730                   build_int_cst (TREE_TYPE (t), -16));
10731       t = fold_convert (TREE_TYPE (arg), t);
10732       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10733     }
10734   else
10735     roundup = NULL;
10736   /* Advance ap.__stack  */
10737   t = fold_convert (intDI_type_node, arg);
10738   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10739               build_int_cst (TREE_TYPE (t), size + 7));
10740   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10741               build_int_cst (TREE_TYPE (t), -8));
10742   t = fold_convert (TREE_TYPE (arg), t);
10743   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10744   /* String up roundup and advance.  */
10745   if (roundup)
10746     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10747   /* String up with arg */
10748   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10749   /* Big-endianness related address adjustment.  */
10750   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
10751       && size < UNITS_PER_WORD)
10752   {
10753     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10754                 size_int (UNITS_PER_WORD - size));
10755     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10756   }
10757
10758   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10759   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10760
10761   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10762   t = off;
10763   if (adjust)
10764     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10765                 build_int_cst (TREE_TYPE (off), adjust));
10766
10767   t = fold_convert (sizetype, t);
10768   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10769
10770   if (is_ha)
10771     {
10772       /* type ha; // treat as "struct {ftype field[n];}"
10773          ... [computing offs]
10774          for (i = 0; i <nregs; ++i, offs += 16)
10775            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10776          return ha;  */
10777       int i;
10778       tree tmp_ha, field_t, field_ptr_t;
10779
10780       /* Declare a local variable.  */
10781       tmp_ha = create_tmp_var_raw (type, "ha");
10782       gimple_add_tmp_var (tmp_ha);
10783
10784       /* Establish the base type.  */
10785       switch (ag_mode)
10786         {
10787         case E_SFmode:
10788           field_t = float_type_node;
10789           field_ptr_t = float_ptr_type_node;
10790           break;
10791         case E_DFmode:
10792           field_t = double_type_node;
10793           field_ptr_t = double_ptr_type_node;
10794           break;
10795         case E_TFmode:
10796           field_t = long_double_type_node;
10797           field_ptr_t = long_double_ptr_type_node;
10798           break;
10799         case E_HFmode:
10800           field_t = aarch64_fp16_type_node;
10801           field_ptr_t = aarch64_fp16_ptr_type_node;
10802           break;
10803         case E_V2SImode:
10804         case E_V4SImode:
10805             {
10806               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10807               field_t = build_vector_type_for_mode (innertype, ag_mode);
10808               field_ptr_t = build_pointer_type (field_t);
10809             }
10810           break;
10811         default:
10812           gcc_assert (0);
10813         }
10814
10815       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10816       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10817       addr = t;
10818       t = fold_convert (field_ptr_t, addr);
10819       t = build2 (MODIFY_EXPR, field_t,
10820                   build1 (INDIRECT_REF, field_t, tmp_ha),
10821                   build1 (INDIRECT_REF, field_t, t));
10822
10823       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10824       for (i = 1; i < nregs; ++i)
10825         {
10826           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10827           u = fold_convert (field_ptr_t, addr);
10828           u = build2 (MODIFY_EXPR, field_t,
10829                       build2 (MEM_REF, field_t, tmp_ha,
10830                               build_int_cst (field_ptr_t,
10831                                              (i *
10832                                               int_size_in_bytes (field_t)))),
10833                       build1 (INDIRECT_REF, field_t, u));
10834           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10835         }
10836
10837       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10838       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10839     }
10840
10841   COND_EXPR_ELSE (cond2) = t;
10842   addr = fold_convert (build_pointer_type (type), cond1);
10843   addr = build_va_arg_indirect_ref (addr);
10844
10845   if (indirect_p)
10846     addr = build_va_arg_indirect_ref (addr);
10847
10848   return addr;
10849 }
10850
10851 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10852
10853 static void
10854 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10855                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10856                                 int no_rtl)
10857 {
10858   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10859   CUMULATIVE_ARGS local_cum;
10860   int gr_saved = cfun->va_list_gpr_size;
10861   int vr_saved = cfun->va_list_fpr_size;
10862
10863   /* The caller has advanced CUM up to, but not beyond, the last named
10864      argument.  Advance a local copy of CUM past the last "real" named
10865      argument, to find out how many registers are left over.  */
10866   local_cum = *cum;
10867   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10868
10869   /* Found out how many registers we need to save.
10870      Honor tree-stdvar analysis results.  */
10871   if (cfun->va_list_gpr_size)
10872     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10873                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10874   if (cfun->va_list_fpr_size)
10875     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10876                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10877
10878   if (!TARGET_FLOAT)
10879     {
10880       gcc_assert (local_cum.aapcs_nvrn == 0);
10881       vr_saved = 0;
10882     }
10883
10884   if (!no_rtl)
10885     {
10886       if (gr_saved > 0)
10887         {
10888           rtx ptr, mem;
10889
10890           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10891           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10892                                - gr_saved * UNITS_PER_WORD);
10893           mem = gen_frame_mem (BLKmode, ptr);
10894           set_mem_alias_set (mem, get_varargs_alias_set ());
10895
10896           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10897                                mem, gr_saved);
10898         }
10899       if (vr_saved > 0)
10900         {
10901           /* We can't use move_block_from_reg, because it will use
10902              the wrong mode, storing D regs only.  */
10903           machine_mode mode = TImode;
10904           int off, i, vr_start;
10905
10906           /* Set OFF to the offset from virtual_incoming_args_rtx of
10907              the first vector register.  The VR save area lies below
10908              the GR one, and is aligned to 16 bytes.  */
10909           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10910                            STACK_BOUNDARY / BITS_PER_UNIT);
10911           off -= vr_saved * UNITS_PER_VREG;
10912
10913           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10914           for (i = 0; i < vr_saved; ++i)
10915             {
10916               rtx ptr, mem;
10917
10918               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10919               mem = gen_frame_mem (mode, ptr);
10920               set_mem_alias_set (mem, get_varargs_alias_set ());
10921               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10922               off += UNITS_PER_VREG;
10923             }
10924         }
10925     }
10926
10927   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10928      any complication of having crtl->args.pretend_args_size changed.  */
10929   cfun->machine->frame.saved_varargs_size
10930     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10931                  STACK_BOUNDARY / BITS_PER_UNIT)
10932        + vr_saved * UNITS_PER_VREG);
10933 }
10934
10935 static void
10936 aarch64_conditional_register_usage (void)
10937 {
10938   int i;
10939   if (!TARGET_FLOAT)
10940     {
10941       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10942         {
10943           fixed_regs[i] = 1;
10944           call_used_regs[i] = 1;
10945         }
10946     }
10947 }
10948
10949 /* Walk down the type tree of TYPE counting consecutive base elements.
10950    If *MODEP is VOIDmode, then set it to the first valid floating point
10951    type.  If a non-floating point type is found, or if a floating point
10952    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10953    otherwise return the count in the sub-tree.  */
10954 static int
10955 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10956 {
10957   machine_mode mode;
10958   HOST_WIDE_INT size;
10959
10960   switch (TREE_CODE (type))
10961     {
10962     case REAL_TYPE:
10963       mode = TYPE_MODE (type);
10964       if (mode != DFmode && mode != SFmode
10965           && mode != TFmode && mode != HFmode)
10966         return -1;
10967
10968       if (*modep == VOIDmode)
10969         *modep = mode;
10970
10971       if (*modep == mode)
10972         return 1;
10973
10974       break;
10975
10976     case COMPLEX_TYPE:
10977       mode = TYPE_MODE (TREE_TYPE (type));
10978       if (mode != DFmode && mode != SFmode
10979           && mode != TFmode && mode != HFmode)
10980         return -1;
10981
10982       if (*modep == VOIDmode)
10983         *modep = mode;
10984
10985       if (*modep == mode)
10986         return 2;
10987
10988       break;
10989
10990     case VECTOR_TYPE:
10991       /* Use V2SImode and V4SImode as representatives of all 64-bit
10992          and 128-bit vector types.  */
10993       size = int_size_in_bytes (type);
10994       switch (size)
10995         {
10996         case 8:
10997           mode = V2SImode;
10998           break;
10999         case 16:
11000           mode = V4SImode;
11001           break;
11002         default:
11003           return -1;
11004         }
11005
11006       if (*modep == VOIDmode)
11007         *modep = mode;
11008
11009       /* Vector modes are considered to be opaque: two vectors are
11010          equivalent for the purposes of being homogeneous aggregates
11011          if they are the same size.  */
11012       if (*modep == mode)
11013         return 1;
11014
11015       break;
11016
11017     case ARRAY_TYPE:
11018       {
11019         int count;
11020         tree index = TYPE_DOMAIN (type);
11021
11022         /* Can't handle incomplete types nor sizes that are not
11023            fixed.  */
11024         if (!COMPLETE_TYPE_P (type)
11025             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11026           return -1;
11027
11028         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
11029         if (count == -1
11030             || !index
11031             || !TYPE_MAX_VALUE (index)
11032             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
11033             || !TYPE_MIN_VALUE (index)
11034             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
11035             || count < 0)
11036           return -1;
11037
11038         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
11039                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
11040
11041         /* There must be no padding.  */
11042         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11043           return -1;
11044
11045         return count;
11046       }
11047
11048     case RECORD_TYPE:
11049       {
11050         int count = 0;
11051         int sub_count;
11052         tree field;
11053
11054         /* Can't handle incomplete types nor sizes that are not
11055            fixed.  */
11056         if (!COMPLETE_TYPE_P (type)
11057             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11058           return -1;
11059
11060         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11061           {
11062             if (TREE_CODE (field) != FIELD_DECL)
11063               continue;
11064
11065             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11066             if (sub_count < 0)
11067               return -1;
11068             count += sub_count;
11069           }
11070
11071         /* There must be no padding.  */
11072         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11073           return -1;
11074
11075         return count;
11076       }
11077
11078     case UNION_TYPE:
11079     case QUAL_UNION_TYPE:
11080       {
11081         /* These aren't very interesting except in a degenerate case.  */
11082         int count = 0;
11083         int sub_count;
11084         tree field;
11085
11086         /* Can't handle incomplete types nor sizes that are not
11087            fixed.  */
11088         if (!COMPLETE_TYPE_P (type)
11089             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11090           return -1;
11091
11092         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11093           {
11094             if (TREE_CODE (field) != FIELD_DECL)
11095               continue;
11096
11097             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11098             if (sub_count < 0)
11099               return -1;
11100             count = count > sub_count ? count : sub_count;
11101           }
11102
11103         /* There must be no padding.  */
11104         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11105           return -1;
11106
11107         return count;
11108       }
11109
11110     default:
11111       break;
11112     }
11113
11114   return -1;
11115 }
11116
11117 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11118    type as described in AAPCS64 \S 4.1.2.
11119
11120    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11121
11122 static bool
11123 aarch64_short_vector_p (const_tree type,
11124                         machine_mode mode)
11125 {
11126   HOST_WIDE_INT size = -1;
11127
11128   if (type && TREE_CODE (type) == VECTOR_TYPE)
11129     size = int_size_in_bytes (type);
11130   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11131             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11132     size = GET_MODE_SIZE (mode);
11133
11134   return (size == 8 || size == 16);
11135 }
11136
11137 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11138    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11139    array types.  The C99 floating-point complex types are also considered
11140    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11141    types, which are GCC extensions and out of the scope of AAPCS64, are
11142    treated as composite types here as well.
11143
11144    Note that MODE itself is not sufficient in determining whether a type
11145    is such a composite type or not.  This is because
11146    stor-layout.c:compute_record_mode may have already changed the MODE
11147    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11148    structure with only one field may have its MODE set to the mode of the
11149    field.  Also an integer mode whose size matches the size of the
11150    RECORD_TYPE type may be used to substitute the original mode
11151    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11152    solely relied on.  */
11153
11154 static bool
11155 aarch64_composite_type_p (const_tree type,
11156                           machine_mode mode)
11157 {
11158   if (aarch64_short_vector_p (type, mode))
11159     return false;
11160
11161   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11162     return true;
11163
11164   if (mode == BLKmode
11165       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11166       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11167     return true;
11168
11169   return false;
11170 }
11171
11172 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11173    shall be passed or returned in simd/fp register(s) (providing these
11174    parameter passing registers are available).
11175
11176    Upon successful return, *COUNT returns the number of needed registers,
11177    *BASE_MODE returns the mode of the individual register and when IS_HAF
11178    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11179    floating-point aggregate or a homogeneous short-vector aggregate.  */
11180
11181 static bool
11182 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11183                                          const_tree type,
11184                                          machine_mode *base_mode,
11185                                          int *count,
11186                                          bool *is_ha)
11187 {
11188   machine_mode new_mode = VOIDmode;
11189   bool composite_p = aarch64_composite_type_p (type, mode);
11190
11191   if (is_ha != NULL) *is_ha = false;
11192
11193   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11194       || aarch64_short_vector_p (type, mode))
11195     {
11196       *count = 1;
11197       new_mode = mode;
11198     }
11199   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11200     {
11201       if (is_ha != NULL) *is_ha = true;
11202       *count = 2;
11203       new_mode = GET_MODE_INNER (mode);
11204     }
11205   else if (type && composite_p)
11206     {
11207       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11208
11209       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11210         {
11211           if (is_ha != NULL) *is_ha = true;
11212           *count = ag_count;
11213         }
11214       else
11215         return false;
11216     }
11217   else
11218     return false;
11219
11220   *base_mode = new_mode;
11221   return true;
11222 }
11223
11224 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11225
11226 static rtx
11227 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11228                           int incoming ATTRIBUTE_UNUSED)
11229 {
11230   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11231 }
11232
11233 /* Implements target hook vector_mode_supported_p.  */
11234 static bool
11235 aarch64_vector_mode_supported_p (machine_mode mode)
11236 {
11237   if (TARGET_SIMD
11238       && (mode == V4SImode  || mode == V8HImode
11239           || mode == V16QImode || mode == V2DImode
11240           || mode == V2SImode  || mode == V4HImode
11241           || mode == V8QImode || mode == V2SFmode
11242           || mode == V4SFmode || mode == V2DFmode
11243           || mode == V4HFmode || mode == V8HFmode
11244           || mode == V1DFmode))
11245     return true;
11246
11247   return false;
11248 }
11249
11250 /* Return appropriate SIMD container
11251    for MODE within a vector of WIDTH bits.  */
11252 static machine_mode
11253 aarch64_simd_container_mode (scalar_mode mode, unsigned width)
11254 {
11255   gcc_assert (width == 64 || width == 128);
11256   if (TARGET_SIMD)
11257     {
11258       if (width == 128)
11259         switch (mode)
11260           {
11261           case E_DFmode:
11262             return V2DFmode;
11263           case E_SFmode:
11264             return V4SFmode;
11265           case E_HFmode:
11266             return V8HFmode;
11267           case E_SImode:
11268             return V4SImode;
11269           case E_HImode:
11270             return V8HImode;
11271           case E_QImode:
11272             return V16QImode;
11273           case E_DImode:
11274             return V2DImode;
11275           default:
11276             break;
11277           }
11278       else
11279         switch (mode)
11280           {
11281           case E_SFmode:
11282             return V2SFmode;
11283           case E_HFmode:
11284             return V4HFmode;
11285           case E_SImode:
11286             return V2SImode;
11287           case E_HImode:
11288             return V4HImode;
11289           case E_QImode:
11290             return V8QImode;
11291           default:
11292             break;
11293           }
11294     }
11295   return word_mode;
11296 }
11297
11298 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11299 static machine_mode
11300 aarch64_preferred_simd_mode (scalar_mode mode)
11301 {
11302   return aarch64_simd_container_mode (mode, 128);
11303 }
11304
11305 /* Return the bitmask of possible vector sizes for the vectorizer
11306    to iterate over.  */
11307 static unsigned int
11308 aarch64_autovectorize_vector_sizes (void)
11309 {
11310   return (16 | 8);
11311 }
11312
11313 /* Implement TARGET_MANGLE_TYPE.  */
11314
11315 static const char *
11316 aarch64_mangle_type (const_tree type)
11317 {
11318   /* The AArch64 ABI documents say that "__va_list" has to be
11319      managled as if it is in the "std" namespace.  */
11320   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11321     return "St9__va_list";
11322
11323   /* Half-precision float.  */
11324   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11325     return "Dh";
11326
11327   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11328      builtin types.  */
11329   if (TYPE_NAME (type) != NULL)
11330     return aarch64_mangle_builtin_type (type);
11331
11332   /* Use the default mangling.  */
11333   return NULL;
11334 }
11335
11336 /* Find the first rtx_insn before insn that will generate an assembly
11337    instruction.  */
11338
11339 static rtx_insn *
11340 aarch64_prev_real_insn (rtx_insn *insn)
11341 {
11342   if (!insn)
11343     return NULL;
11344
11345   do
11346     {
11347       insn = prev_real_insn (insn);
11348     }
11349   while (insn && recog_memoized (insn) < 0);
11350
11351   return insn;
11352 }
11353
11354 static bool
11355 is_madd_op (enum attr_type t1)
11356 {
11357   unsigned int i;
11358   /* A number of these may be AArch32 only.  */
11359   enum attr_type mlatypes[] = {
11360     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11361     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11362     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11363   };
11364
11365   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11366     {
11367       if (t1 == mlatypes[i])
11368         return true;
11369     }
11370
11371   return false;
11372 }
11373
11374 /* Check if there is a register dependency between a load and the insn
11375    for which we hold recog_data.  */
11376
11377 static bool
11378 dep_between_memop_and_curr (rtx memop)
11379 {
11380   rtx load_reg;
11381   int opno;
11382
11383   gcc_assert (GET_CODE (memop) == SET);
11384
11385   if (!REG_P (SET_DEST (memop)))
11386     return false;
11387
11388   load_reg = SET_DEST (memop);
11389   for (opno = 1; opno < recog_data.n_operands; opno++)
11390     {
11391       rtx operand = recog_data.operand[opno];
11392       if (REG_P (operand)
11393           && reg_overlap_mentioned_p (load_reg, operand))
11394         return true;
11395
11396     }
11397   return false;
11398 }
11399
11400
11401 /* When working around the Cortex-A53 erratum 835769,
11402    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11403    instruction and has a preceding memory instruction such that a NOP
11404    should be inserted between them.  */
11405
11406 bool
11407 aarch64_madd_needs_nop (rtx_insn* insn)
11408 {
11409   enum attr_type attr_type;
11410   rtx_insn *prev;
11411   rtx body;
11412
11413   if (!TARGET_FIX_ERR_A53_835769)
11414     return false;
11415
11416   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11417     return false;
11418
11419   attr_type = get_attr_type (insn);
11420   if (!is_madd_op (attr_type))
11421     return false;
11422
11423   prev = aarch64_prev_real_insn (insn);
11424   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11425      Restore recog state to INSN to avoid state corruption.  */
11426   extract_constrain_insn_cached (insn);
11427
11428   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11429     return false;
11430
11431   body = single_set (prev);
11432
11433   /* If the previous insn is a memory op and there is no dependency between
11434      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11435      have a complex memory operation, probably a load/store pair.
11436      Be conservative for now and emit a NOP.  */
11437   if (GET_MODE (recog_data.operand[0]) == DImode
11438       && (!body || !dep_between_memop_and_curr (body)))
11439     return true;
11440
11441   return false;
11442
11443 }
11444
11445
11446 /* Implement FINAL_PRESCAN_INSN.  */
11447
11448 void
11449 aarch64_final_prescan_insn (rtx_insn *insn)
11450 {
11451   if (aarch64_madd_needs_nop (insn))
11452     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11453 }
11454
11455
11456 /* Return the equivalent letter for size.  */
11457 static char
11458 sizetochar (int size)
11459 {
11460   switch (size)
11461     {
11462     case 64: return 'd';
11463     case 32: return 's';
11464     case 16: return 'h';
11465     case 8 : return 'b';
11466     default: gcc_unreachable ();
11467     }
11468 }
11469
11470 /* Return true iff x is a uniform vector of floating-point
11471    constants, and the constant can be represented in
11472    quarter-precision form.  Note, as aarch64_float_const_representable
11473    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11474 static bool
11475 aarch64_vect_float_const_representable_p (rtx x)
11476 {
11477   rtx elt;
11478   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11479           && const_vec_duplicate_p (x, &elt)
11480           && aarch64_float_const_representable_p (elt));
11481 }
11482
11483 /* Return true for valid and false for invalid.  */
11484 bool
11485 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11486                               struct simd_immediate_info *info)
11487 {
11488 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11489   matches = 1;                                          \
11490   for (i = 0; i < idx; i += (STRIDE))                   \
11491     if (!(TEST))                                        \
11492       matches = 0;                                      \
11493   if (matches)                                          \
11494     {                                                   \
11495       immtype = (CLASS);                                \
11496       elsize = (ELSIZE);                                \
11497       eshift = (SHIFT);                                 \
11498       emvn = (NEG);                                     \
11499       break;                                            \
11500     }
11501
11502   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11503   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11504   unsigned char bytes[16];
11505   int immtype = -1, matches;
11506   unsigned int invmask = inverse ? 0xff : 0;
11507   int eshift, emvn;
11508
11509   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11510     {
11511       if (! (aarch64_simd_imm_zero_p (op, mode)
11512              || aarch64_vect_float_const_representable_p (op)))
11513         return false;
11514
11515       if (info)
11516         {
11517           rtx elt = CONST_VECTOR_ELT (op, 0);
11518           scalar_float_mode elt_mode
11519             = as_a <scalar_float_mode> (GET_MODE (elt));
11520
11521           info->value = elt;
11522           info->element_width = GET_MODE_BITSIZE (elt_mode);
11523           info->mvn = false;
11524           info->shift = 0;
11525         }
11526
11527       return true;
11528     }
11529
11530   /* Splat vector constant out into a byte vector.  */
11531   for (i = 0; i < n_elts; i++)
11532     {
11533       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11534          it must be laid out in the vector register in reverse order.  */
11535       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11536       unsigned HOST_WIDE_INT elpart;
11537
11538       gcc_assert (CONST_INT_P (el));
11539       elpart = INTVAL (el);
11540
11541       for (unsigned int byte = 0; byte < innersize; byte++)
11542         {
11543           bytes[idx++] = (elpart & 0xff) ^ invmask;
11544           elpart >>= BITS_PER_UNIT;
11545         }
11546
11547     }
11548
11549   /* Sanity check.  */
11550   gcc_assert (idx == GET_MODE_SIZE (mode));
11551
11552   do
11553     {
11554       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11555              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11556
11557       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11558              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11559
11560       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11561              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11562
11563       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11564              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11565
11566       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11567
11568       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11569
11570       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11571              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11572
11573       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11574              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11575
11576       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11577              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11578
11579       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11580              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11581
11582       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11583
11584       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11585
11586       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11587              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11588
11589       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11590              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11591
11592       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11593              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11594
11595       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11596              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11597
11598       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11599
11600       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11601              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11602     }
11603   while (0);
11604
11605   if (immtype == -1)
11606     return false;
11607
11608   if (info)
11609     {
11610       info->element_width = elsize;
11611       info->mvn = emvn != 0;
11612       info->shift = eshift;
11613
11614       unsigned HOST_WIDE_INT imm = 0;
11615
11616       if (immtype >= 12 && immtype <= 15)
11617         info->msl = true;
11618
11619       /* Un-invert bytes of recognized vector, if necessary.  */
11620       if (invmask != 0)
11621         for (i = 0; i < idx; i++)
11622           bytes[i] ^= invmask;
11623
11624       if (immtype == 17)
11625         {
11626           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11627           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11628
11629           for (i = 0; i < 8; i++)
11630             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11631               << (i * BITS_PER_UNIT);
11632
11633
11634           info->value = GEN_INT (imm);
11635         }
11636       else
11637         {
11638           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11639             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11640
11641           /* Construct 'abcdefgh' because the assembler cannot handle
11642              generic constants.  */
11643           if (info->mvn)
11644             imm = ~imm;
11645           imm = (imm >> info->shift) & 0xff;
11646           info->value = GEN_INT (imm);
11647         }
11648     }
11649
11650   return true;
11651 #undef CHECK
11652 }
11653
11654 /* Check of immediate shift constants are within range.  */
11655 bool
11656 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11657 {
11658   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11659   if (left)
11660     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11661   else
11662     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11663 }
11664
11665 /* Return true if X is a uniform vector where all elements
11666    are either the floating-point constant 0.0 or the
11667    integer constant 0.  */
11668 bool
11669 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11670 {
11671   return x == CONST0_RTX (mode);
11672 }
11673
11674
11675 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11676    operation of width WIDTH at bit position POS.  */
11677
11678 rtx
11679 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11680 {
11681   gcc_assert (CONST_INT_P (width));
11682   gcc_assert (CONST_INT_P (pos));
11683
11684   unsigned HOST_WIDE_INT mask
11685     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11686   return GEN_INT (mask << UINTVAL (pos));
11687 }
11688
11689 bool
11690 aarch64_mov_operand_p (rtx x, machine_mode mode)
11691 {
11692   if (GET_CODE (x) == HIGH
11693       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11694     return true;
11695
11696   if (CONST_INT_P (x))
11697     return true;
11698
11699   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11700     return true;
11701
11702   return aarch64_classify_symbolic_expression (x)
11703     == SYMBOL_TINY_ABSOLUTE;
11704 }
11705
11706 /* Return a const_int vector of VAL.  */
11707 rtx
11708 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11709 {
11710   int nunits = GET_MODE_NUNITS (mode);
11711   rtvec v = rtvec_alloc (nunits);
11712   int i;
11713
11714   rtx cache = GEN_INT (val);
11715
11716   for (i=0; i < nunits; i++)
11717     RTVEC_ELT (v, i) = cache;
11718
11719   return gen_rtx_CONST_VECTOR (mode, v);
11720 }
11721
11722 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11723
11724 bool
11725 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
11726 {
11727   machine_mode vmode;
11728
11729   vmode = aarch64_preferred_simd_mode (mode);
11730   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11731   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11732 }
11733
11734 /* Construct and return a PARALLEL RTX vector with elements numbering the
11735    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11736    the vector - from the perspective of the architecture.  This does not
11737    line up with GCC's perspective on lane numbers, so we end up with
11738    different masks depending on our target endian-ness.  The diagram
11739    below may help.  We must draw the distinction when building masks
11740    which select one half of the vector.  An instruction selecting
11741    architectural low-lanes for a big-endian target, must be described using
11742    a mask selecting GCC high-lanes.
11743
11744                  Big-Endian             Little-Endian
11745
11746 GCC             0   1   2   3           3   2   1   0
11747               | x | x | x | x |       | x | x | x | x |
11748 Architecture    3   2   1   0           3   2   1   0
11749
11750 Low Mask:         { 2, 3 }                { 0, 1 }
11751 High Mask:        { 0, 1 }                { 2, 3 }
11752 */
11753
11754 rtx
11755 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11756 {
11757   int nunits = GET_MODE_NUNITS (mode);
11758   rtvec v = rtvec_alloc (nunits / 2);
11759   int high_base = nunits / 2;
11760   int low_base = 0;
11761   int base;
11762   rtx t1;
11763   int i;
11764
11765   if (BYTES_BIG_ENDIAN)
11766     base = high ? low_base : high_base;
11767   else
11768     base = high ? high_base : low_base;
11769
11770   for (i = 0; i < nunits / 2; i++)
11771     RTVEC_ELT (v, i) = GEN_INT (base + i);
11772
11773   t1 = gen_rtx_PARALLEL (mode, v);
11774   return t1;
11775 }
11776
11777 /* Check OP for validity as a PARALLEL RTX vector with elements
11778    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11779    from the perspective of the architecture.  See the diagram above
11780    aarch64_simd_vect_par_cnst_half for more details.  */
11781
11782 bool
11783 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11784                                        bool high)
11785 {
11786   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11787   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11788   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11789   int i = 0;
11790
11791   if (!VECTOR_MODE_P (mode))
11792     return false;
11793
11794   if (count_op != count_ideal)
11795     return false;
11796
11797   for (i = 0; i < count_ideal; i++)
11798     {
11799       rtx elt_op = XVECEXP (op, 0, i);
11800       rtx elt_ideal = XVECEXP (ideal, 0, i);
11801
11802       if (!CONST_INT_P (elt_op)
11803           || INTVAL (elt_ideal) != INTVAL (elt_op))
11804         return false;
11805     }
11806   return true;
11807 }
11808
11809 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11810    HIGH (exclusive).  */
11811 void
11812 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11813                           const_tree exp)
11814 {
11815   HOST_WIDE_INT lane;
11816   gcc_assert (CONST_INT_P (operand));
11817   lane = INTVAL (operand);
11818
11819   if (lane < low || lane >= high)
11820   {
11821     if (exp)
11822       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11823     else
11824       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11825   }
11826 }
11827
11828 /* Return TRUE if OP is a valid vector addressing mode.  */
11829 bool
11830 aarch64_simd_mem_operand_p (rtx op)
11831 {
11832   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11833                         || REG_P (XEXP (op, 0)));
11834 }
11835
11836 /* Emit a register copy from operand to operand, taking care not to
11837    early-clobber source registers in the process.
11838
11839    COUNT is the number of components into which the copy needs to be
11840    decomposed.  */
11841 void
11842 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11843                                 unsigned int count)
11844 {
11845   unsigned int i;
11846   int rdest = REGNO (operands[0]);
11847   int rsrc = REGNO (operands[1]);
11848
11849   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11850       || rdest < rsrc)
11851     for (i = 0; i < count; i++)
11852       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11853                       gen_rtx_REG (mode, rsrc + i));
11854   else
11855     for (i = 0; i < count; i++)
11856       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11857                       gen_rtx_REG (mode, rsrc + count - i - 1));
11858 }
11859
11860 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11861    one of VSTRUCT modes: OI, CI, or XI.  */
11862 int
11863 aarch64_simd_attr_length_rglist (machine_mode mode)
11864 {
11865   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11866 }
11867
11868 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11869    alignment of a vector to 128 bits.  */
11870 static HOST_WIDE_INT
11871 aarch64_simd_vector_alignment (const_tree type)
11872 {
11873   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11874   return MIN (align, 128);
11875 }
11876
11877 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11878 static bool
11879 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11880 {
11881   if (is_packed)
11882     return false;
11883
11884   /* We guarantee alignment for vectors up to 128-bits.  */
11885   if (tree_int_cst_compare (TYPE_SIZE (type),
11886                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11887     return false;
11888
11889   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11890   return true;
11891 }
11892
11893 /* Return true if the vector misalignment factor is supported by the
11894    target.  */
11895 static bool
11896 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11897                                              const_tree type, int misalignment,
11898                                              bool is_packed)
11899 {
11900   if (TARGET_SIMD && STRICT_ALIGNMENT)
11901     {
11902       /* Return if movmisalign pattern is not supported for this mode.  */
11903       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11904         return false;
11905
11906       /* Misalignment factor is unknown at compile time.  */
11907       if (misalignment == -1)
11908         return false;
11909     }
11910   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11911                                                       is_packed);
11912 }
11913
11914 /* If VALS is a vector constant that can be loaded into a register
11915    using DUP, generate instructions to do so and return an RTX to
11916    assign to the register.  Otherwise return NULL_RTX.  */
11917 static rtx
11918 aarch64_simd_dup_constant (rtx vals)
11919 {
11920   machine_mode mode = GET_MODE (vals);
11921   machine_mode inner_mode = GET_MODE_INNER (mode);
11922   rtx x;
11923
11924   if (!const_vec_duplicate_p (vals, &x))
11925     return NULL_RTX;
11926
11927   /* We can load this constant by using DUP and a constant in a
11928      single ARM register.  This will be cheaper than a vector
11929      load.  */
11930   x = copy_to_mode_reg (inner_mode, x);
11931   return gen_rtx_VEC_DUPLICATE (mode, x);
11932 }
11933
11934
11935 /* Generate code to load VALS, which is a PARALLEL containing only
11936    constants (for vec_init) or CONST_VECTOR, efficiently into a
11937    register.  Returns an RTX to copy into the register, or NULL_RTX
11938    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11939 static rtx
11940 aarch64_simd_make_constant (rtx vals)
11941 {
11942   machine_mode mode = GET_MODE (vals);
11943   rtx const_dup;
11944   rtx const_vec = NULL_RTX;
11945   int n_elts = GET_MODE_NUNITS (mode);
11946   int n_const = 0;
11947   int i;
11948
11949   if (GET_CODE (vals) == CONST_VECTOR)
11950     const_vec = vals;
11951   else if (GET_CODE (vals) == PARALLEL)
11952     {
11953       /* A CONST_VECTOR must contain only CONST_INTs and
11954          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11955          Only store valid constants in a CONST_VECTOR.  */
11956       for (i = 0; i < n_elts; ++i)
11957         {
11958           rtx x = XVECEXP (vals, 0, i);
11959           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11960             n_const++;
11961         }
11962       if (n_const == n_elts)
11963         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11964     }
11965   else
11966     gcc_unreachable ();
11967
11968   if (const_vec != NULL_RTX
11969       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11970     /* Load using MOVI/MVNI.  */
11971     return const_vec;
11972   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11973     /* Loaded using DUP.  */
11974     return const_dup;
11975   else if (const_vec != NULL_RTX)
11976     /* Load from constant pool. We can not take advantage of single-cycle
11977        LD1 because we need a PC-relative addressing mode.  */
11978     return const_vec;
11979   else
11980     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11981        We can not construct an initializer.  */
11982     return NULL_RTX;
11983 }
11984
11985 /* Expand a vector initialisation sequence, such that TARGET is
11986    initialised to contain VALS.  */
11987
11988 void
11989 aarch64_expand_vector_init (rtx target, rtx vals)
11990 {
11991   machine_mode mode = GET_MODE (target);
11992   scalar_mode inner_mode = GET_MODE_INNER (mode);
11993   /* The number of vector elements.  */
11994   int n_elts = GET_MODE_NUNITS (mode);
11995   /* The number of vector elements which are not constant.  */
11996   int n_var = 0;
11997   rtx any_const = NULL_RTX;
11998   /* The first element of vals.  */
11999   rtx v0 = XVECEXP (vals, 0, 0);
12000   bool all_same = true;
12001
12002   /* Count the number of variable elements to initialise.  */
12003   for (int i = 0; i < n_elts; ++i)
12004     {
12005       rtx x = XVECEXP (vals, 0, i);
12006       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
12007         ++n_var;
12008       else
12009         any_const = x;
12010
12011       all_same &= rtx_equal_p (x, v0);
12012     }
12013
12014   /* No variable elements, hand off to aarch64_simd_make_constant which knows
12015      how best to handle this.  */
12016   if (n_var == 0)
12017     {
12018       rtx constant = aarch64_simd_make_constant (vals);
12019       if (constant != NULL_RTX)
12020         {
12021           emit_move_insn (target, constant);
12022           return;
12023         }
12024     }
12025
12026   /* Splat a single non-constant element if we can.  */
12027   if (all_same)
12028     {
12029       rtx x = copy_to_mode_reg (inner_mode, v0);
12030       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12031       return;
12032     }
12033
12034   enum insn_code icode = optab_handler (vec_set_optab, mode);
12035   gcc_assert (icode != CODE_FOR_nothing);
12036
12037   /* If there are only variable elements, try to optimize
12038      the insertion using dup for the most common element
12039      followed by insertions.  */
12040
12041   /* The algorithm will fill matches[*][0] with the earliest matching element,
12042      and matches[X][1] with the count of duplicate elements (if X is the
12043      earliest element which has duplicates).  */
12044
12045   if (n_var == n_elts && n_elts <= 16)
12046     {
12047       int matches[16][2] = {0};
12048       for (int i = 0; i < n_elts; i++)
12049         {
12050           for (int j = 0; j <= i; j++)
12051             {
12052               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12053                 {
12054                   matches[i][0] = j;
12055                   matches[j][1]++;
12056                   break;
12057                 }
12058             }
12059         }
12060       int maxelement = 0;
12061       int maxv = 0;
12062       for (int i = 0; i < n_elts; i++)
12063         if (matches[i][1] > maxv)
12064           {
12065             maxelement = i;
12066             maxv = matches[i][1];
12067           }
12068
12069       /* Create a duplicate of the most common element.  */
12070       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12071       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12072
12073       /* Insert the rest.  */
12074       for (int i = 0; i < n_elts; i++)
12075         {
12076           rtx x = XVECEXP (vals, 0, i);
12077           if (matches[i][0] == maxelement)
12078             continue;
12079           x = copy_to_mode_reg (inner_mode, x);
12080           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12081         }
12082       return;
12083     }
12084
12085   /* Initialise a vector which is part-variable.  We want to first try
12086      to build those lanes which are constant in the most efficient way we
12087      can.  */
12088   if (n_var != n_elts)
12089     {
12090       rtx copy = copy_rtx (vals);
12091
12092       /* Load constant part of vector.  We really don't care what goes into the
12093          parts we will overwrite, but we're more likely to be able to load the
12094          constant efficiently if it has fewer, larger, repeating parts
12095          (see aarch64_simd_valid_immediate).  */
12096       for (int i = 0; i < n_elts; i++)
12097         {
12098           rtx x = XVECEXP (vals, 0, i);
12099           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12100             continue;
12101           rtx subst = any_const;
12102           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12103             {
12104               /* Look in the copied vector, as more elements are const.  */
12105               rtx test = XVECEXP (copy, 0, i ^ bit);
12106               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12107                 {
12108                   subst = test;
12109                   break;
12110                 }
12111             }
12112           XVECEXP (copy, 0, i) = subst;
12113         }
12114       aarch64_expand_vector_init (target, copy);
12115     }
12116
12117   /* Insert the variable lanes directly.  */
12118   for (int i = 0; i < n_elts; i++)
12119     {
12120       rtx x = XVECEXP (vals, 0, i);
12121       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12122         continue;
12123       x = copy_to_mode_reg (inner_mode, x);
12124       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12125     }
12126 }
12127
12128 static unsigned HOST_WIDE_INT
12129 aarch64_shift_truncation_mask (machine_mode mode)
12130 {
12131   return
12132     (!SHIFT_COUNT_TRUNCATED
12133      || aarch64_vector_mode_supported_p (mode)
12134      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12135 }
12136
12137 /* Select a format to encode pointers in exception handling data.  */
12138 int
12139 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12140 {
12141    int type;
12142    switch (aarch64_cmodel)
12143      {
12144      case AARCH64_CMODEL_TINY:
12145      case AARCH64_CMODEL_TINY_PIC:
12146      case AARCH64_CMODEL_SMALL:
12147      case AARCH64_CMODEL_SMALL_PIC:
12148      case AARCH64_CMODEL_SMALL_SPIC:
12149        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12150           for everything.  */
12151        type = DW_EH_PE_sdata4;
12152        break;
12153      default:
12154        /* No assumptions here.  8-byte relocs required.  */
12155        type = DW_EH_PE_sdata8;
12156        break;
12157      }
12158    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12159 }
12160
12161 /* The last .arch and .tune assembly strings that we printed.  */
12162 static std::string aarch64_last_printed_arch_string;
12163 static std::string aarch64_last_printed_tune_string;
12164
12165 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12166    by the function fndecl.  */
12167
12168 void
12169 aarch64_declare_function_name (FILE *stream, const char* name,
12170                                 tree fndecl)
12171 {
12172   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12173
12174   struct cl_target_option *targ_options;
12175   if (target_parts)
12176     targ_options = TREE_TARGET_OPTION (target_parts);
12177   else
12178     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12179   gcc_assert (targ_options);
12180
12181   const struct processor *this_arch
12182     = aarch64_get_arch (targ_options->x_explicit_arch);
12183
12184   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12185   std::string extension
12186     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12187                                                   this_arch->flags);
12188   /* Only update the assembler .arch string if it is distinct from the last
12189      such string we printed.  */
12190   std::string to_print = this_arch->name + extension;
12191   if (to_print != aarch64_last_printed_arch_string)
12192     {
12193       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12194       aarch64_last_printed_arch_string = to_print;
12195     }
12196
12197   /* Print the cpu name we're tuning for in the comments, might be
12198      useful to readers of the generated asm.  Do it only when it changes
12199      from function to function and verbose assembly is requested.  */
12200   const struct processor *this_tune
12201     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12202
12203   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12204     {
12205       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12206                    this_tune->name);
12207       aarch64_last_printed_tune_string = this_tune->name;
12208     }
12209
12210   /* Don't forget the type directive for ELF.  */
12211   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12212   ASM_OUTPUT_LABEL (stream, name);
12213 }
12214
12215 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12216
12217 static void
12218 aarch64_start_file (void)
12219 {
12220   struct cl_target_option *default_options
12221     = TREE_TARGET_OPTION (target_option_default_node);
12222
12223   const struct processor *default_arch
12224     = aarch64_get_arch (default_options->x_explicit_arch);
12225   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12226   std::string extension
12227     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12228                                                   default_arch->flags);
12229
12230    aarch64_last_printed_arch_string = default_arch->name + extension;
12231    aarch64_last_printed_tune_string = "";
12232    asm_fprintf (asm_out_file, "\t.arch %s\n",
12233                 aarch64_last_printed_arch_string.c_str ());
12234
12235    default_file_start ();
12236 }
12237
12238 /* Emit load exclusive.  */
12239
12240 static void
12241 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12242                              rtx mem, rtx model_rtx)
12243 {
12244   rtx (*gen) (rtx, rtx, rtx);
12245
12246   switch (mode)
12247     {
12248     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12249     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12250     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12251     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12252     default:
12253       gcc_unreachable ();
12254     }
12255
12256   emit_insn (gen (rval, mem, model_rtx));
12257 }
12258
12259 /* Emit store exclusive.  */
12260
12261 static void
12262 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12263                               rtx rval, rtx mem, rtx model_rtx)
12264 {
12265   rtx (*gen) (rtx, rtx, rtx, rtx);
12266
12267   switch (mode)
12268     {
12269     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12270     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12271     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12272     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12273     default:
12274       gcc_unreachable ();
12275     }
12276
12277   emit_insn (gen (bval, rval, mem, model_rtx));
12278 }
12279
12280 /* Mark the previous jump instruction as unlikely.  */
12281
12282 static void
12283 aarch64_emit_unlikely_jump (rtx insn)
12284 {
12285   rtx_insn *jump = emit_jump_insn (insn);
12286   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12287 }
12288
12289 /* Expand a compare and swap pattern.  */
12290
12291 void
12292 aarch64_expand_compare_and_swap (rtx operands[])
12293 {
12294   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12295   machine_mode mode, cmp_mode;
12296   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12297   int idx;
12298   gen_cas_fn gen;
12299   const gen_cas_fn split_cas[] =
12300   {
12301     gen_aarch64_compare_and_swapqi,
12302     gen_aarch64_compare_and_swaphi,
12303     gen_aarch64_compare_and_swapsi,
12304     gen_aarch64_compare_and_swapdi
12305   };
12306   const gen_cas_fn atomic_cas[] =
12307   {
12308     gen_aarch64_compare_and_swapqi_lse,
12309     gen_aarch64_compare_and_swaphi_lse,
12310     gen_aarch64_compare_and_swapsi_lse,
12311     gen_aarch64_compare_and_swapdi_lse
12312   };
12313
12314   bval = operands[0];
12315   rval = operands[1];
12316   mem = operands[2];
12317   oldval = operands[3];
12318   newval = operands[4];
12319   is_weak = operands[5];
12320   mod_s = operands[6];
12321   mod_f = operands[7];
12322   mode = GET_MODE (mem);
12323   cmp_mode = mode;
12324
12325   /* Normally the succ memory model must be stronger than fail, but in the
12326      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12327      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12328
12329   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12330       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12331     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12332
12333   switch (mode)
12334     {
12335     case E_QImode:
12336     case E_HImode:
12337       /* For short modes, we're going to perform the comparison in SImode,
12338          so do the zero-extension now.  */
12339       cmp_mode = SImode;
12340       rval = gen_reg_rtx (SImode);
12341       oldval = convert_modes (SImode, mode, oldval, true);
12342       /* Fall through.  */
12343
12344     case E_SImode:
12345     case E_DImode:
12346       /* Force the value into a register if needed.  */
12347       if (!aarch64_plus_operand (oldval, mode))
12348         oldval = force_reg (cmp_mode, oldval);
12349       break;
12350
12351     default:
12352       gcc_unreachable ();
12353     }
12354
12355   switch (mode)
12356     {
12357     case E_QImode: idx = 0; break;
12358     case E_HImode: idx = 1; break;
12359     case E_SImode: idx = 2; break;
12360     case E_DImode: idx = 3; break;
12361     default:
12362       gcc_unreachable ();
12363     }
12364   if (TARGET_LSE)
12365     gen = atomic_cas[idx];
12366   else
12367     gen = split_cas[idx];
12368
12369   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12370
12371   if (mode == QImode || mode == HImode)
12372     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12373
12374   x = gen_rtx_REG (CCmode, CC_REGNUM);
12375   x = gen_rtx_EQ (SImode, x, const0_rtx);
12376   emit_insn (gen_rtx_SET (bval, x));
12377 }
12378
12379 /* Test whether the target supports using a atomic load-operate instruction.
12380    CODE is the operation and AFTER is TRUE if the data in memory after the
12381    operation should be returned and FALSE if the data before the operation
12382    should be returned.  Returns FALSE if the operation isn't supported by the
12383    architecture.  */
12384
12385 bool
12386 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12387 {
12388   if (!TARGET_LSE)
12389     return false;
12390
12391   switch (code)
12392     {
12393     case SET:
12394     case AND:
12395     case IOR:
12396     case XOR:
12397     case MINUS:
12398     case PLUS:
12399       return true;
12400     default:
12401       return false;
12402     }
12403 }
12404
12405 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12406    sequence implementing an atomic operation.  */
12407
12408 static void
12409 aarch64_emit_post_barrier (enum memmodel model)
12410 {
12411   const enum memmodel base_model = memmodel_base (model);
12412
12413   if (is_mm_sync (model)
12414       && (base_model == MEMMODEL_ACQUIRE
12415           || base_model == MEMMODEL_ACQ_REL
12416           || base_model == MEMMODEL_SEQ_CST))
12417     {
12418       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12419     }
12420 }
12421
12422 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12423    for the data in memory.  EXPECTED is the value expected to be in memory.
12424    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12425    is the memory ordering to use.  */
12426
12427 void
12428 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12429                         rtx expected, rtx desired,
12430                         rtx model)
12431 {
12432   rtx (*gen) (rtx, rtx, rtx, rtx);
12433   machine_mode mode;
12434
12435   mode = GET_MODE (mem);
12436
12437   switch (mode)
12438     {
12439     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12440     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12441     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12442     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12443     default:
12444       gcc_unreachable ();
12445     }
12446
12447   /* Move the expected value into the CAS destination register.  */
12448   emit_insn (gen_rtx_SET (rval, expected));
12449
12450   /* Emit the CAS.  */
12451   emit_insn (gen (rval, mem, desired, model));
12452
12453   /* Compare the expected value with the value loaded by the CAS, to establish
12454      whether the swap was made.  */
12455   aarch64_gen_compare_reg (EQ, rval, expected);
12456 }
12457
12458 /* Split a compare and swap pattern.  */
12459
12460 void
12461 aarch64_split_compare_and_swap (rtx operands[])
12462 {
12463   rtx rval, mem, oldval, newval, scratch;
12464   machine_mode mode;
12465   bool is_weak;
12466   rtx_code_label *label1, *label2;
12467   rtx x, cond;
12468   enum memmodel model;
12469   rtx model_rtx;
12470
12471   rval = operands[0];
12472   mem = operands[1];
12473   oldval = operands[2];
12474   newval = operands[3];
12475   is_weak = (operands[4] != const0_rtx);
12476   model_rtx = operands[5];
12477   scratch = operands[7];
12478   mode = GET_MODE (mem);
12479   model = memmodel_from_int (INTVAL (model_rtx));
12480
12481   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12482     loop:
12483     .label1:
12484         LD[A]XR rval, [mem]
12485         CBNZ    rval, .label2
12486         ST[L]XR scratch, newval, [mem]
12487         CBNZ    scratch, .label1
12488     .label2:
12489         CMP     rval, 0.  */
12490   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12491
12492   label1 = NULL;
12493   if (!is_weak)
12494     {
12495       label1 = gen_label_rtx ();
12496       emit_label (label1);
12497     }
12498   label2 = gen_label_rtx ();
12499
12500   /* The initial load can be relaxed for a __sync operation since a final
12501      barrier will be emitted to stop code hoisting.  */
12502   if (is_mm_sync (model))
12503     aarch64_emit_load_exclusive (mode, rval, mem,
12504                                  GEN_INT (MEMMODEL_RELAXED));
12505   else
12506     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12507
12508   if (strong_zero_p)
12509     {
12510       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12511       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12512                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12513       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12514     }
12515   else
12516     {
12517       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12518       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12519       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12520                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12521       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12522     }
12523
12524   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12525
12526   if (!is_weak)
12527     {
12528       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12529       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12530                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12531       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12532     }
12533   else
12534     {
12535       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12536       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12537       emit_insn (gen_rtx_SET (cond, x));
12538     }
12539
12540   emit_label (label2);
12541   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12542      to set the condition flags.  If this is not used it will be removed by
12543      later passes.  */
12544   if (strong_zero_p)
12545     {
12546       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12547       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12548       emit_insn (gen_rtx_SET (cond, x));
12549     }
12550   /* Emit any final barrier needed for a __sync operation.  */
12551   if (is_mm_sync (model))
12552     aarch64_emit_post_barrier (model);
12553 }
12554
12555 /* Emit a BIC instruction.  */
12556
12557 static void
12558 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12559 {
12560   rtx shift_rtx = GEN_INT (shift);
12561   rtx (*gen) (rtx, rtx, rtx, rtx);
12562
12563   switch (mode)
12564     {
12565     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12566     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12567     default:
12568       gcc_unreachable ();
12569     }
12570
12571   emit_insn (gen (dst, s2, shift_rtx, s1));
12572 }
12573
12574 /* Emit an atomic swap.  */
12575
12576 static void
12577 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12578                           rtx mem, rtx model)
12579 {
12580   rtx (*gen) (rtx, rtx, rtx, rtx);
12581
12582   switch (mode)
12583     {
12584     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12585     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12586     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12587     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12588     default:
12589       gcc_unreachable ();
12590     }
12591
12592   emit_insn (gen (dst, mem, value, model));
12593 }
12594
12595 /* Operations supported by aarch64_emit_atomic_load_op.  */
12596
12597 enum aarch64_atomic_load_op_code
12598 {
12599   AARCH64_LDOP_PLUS,    /* A + B  */
12600   AARCH64_LDOP_XOR,     /* A ^ B  */
12601   AARCH64_LDOP_OR,      /* A | B  */
12602   AARCH64_LDOP_BIC      /* A & ~B  */
12603 };
12604
12605 /* Emit an atomic load-operate.  */
12606
12607 static void
12608 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12609                              machine_mode mode, rtx dst, rtx src,
12610                              rtx mem, rtx model)
12611 {
12612   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12613   const aarch64_atomic_load_op_fn plus[] =
12614   {
12615     gen_aarch64_atomic_loadaddqi,
12616     gen_aarch64_atomic_loadaddhi,
12617     gen_aarch64_atomic_loadaddsi,
12618     gen_aarch64_atomic_loadadddi
12619   };
12620   const aarch64_atomic_load_op_fn eor[] =
12621   {
12622     gen_aarch64_atomic_loadeorqi,
12623     gen_aarch64_atomic_loadeorhi,
12624     gen_aarch64_atomic_loadeorsi,
12625     gen_aarch64_atomic_loadeordi
12626   };
12627   const aarch64_atomic_load_op_fn ior[] =
12628   {
12629     gen_aarch64_atomic_loadsetqi,
12630     gen_aarch64_atomic_loadsethi,
12631     gen_aarch64_atomic_loadsetsi,
12632     gen_aarch64_atomic_loadsetdi
12633   };
12634   const aarch64_atomic_load_op_fn bic[] =
12635   {
12636     gen_aarch64_atomic_loadclrqi,
12637     gen_aarch64_atomic_loadclrhi,
12638     gen_aarch64_atomic_loadclrsi,
12639     gen_aarch64_atomic_loadclrdi
12640   };
12641   aarch64_atomic_load_op_fn gen;
12642   int idx = 0;
12643
12644   switch (mode)
12645     {
12646     case E_QImode: idx = 0; break;
12647     case E_HImode: idx = 1; break;
12648     case E_SImode: idx = 2; break;
12649     case E_DImode: idx = 3; break;
12650     default:
12651       gcc_unreachable ();
12652     }
12653
12654   switch (code)
12655     {
12656     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12657     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12658     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12659     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12660     default:
12661       gcc_unreachable ();
12662     }
12663
12664   emit_insn (gen (dst, mem, src, model));
12665 }
12666
12667 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12668    location to store the data read from memory.  OUT_RESULT is the location to
12669    store the result of the operation.  MEM is the memory location to read and
12670    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12671    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12672    be NULL.  */
12673
12674 void
12675 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12676                          rtx mem, rtx value, rtx model_rtx)
12677 {
12678   machine_mode mode = GET_MODE (mem);
12679   machine_mode wmode = (mode == DImode ? DImode : SImode);
12680   const bool short_mode = (mode < SImode);
12681   aarch64_atomic_load_op_code ldop_code;
12682   rtx src;
12683   rtx x;
12684
12685   if (out_data)
12686     out_data = gen_lowpart (mode, out_data);
12687
12688   if (out_result)
12689     out_result = gen_lowpart (mode, out_result);
12690
12691   /* Make sure the value is in a register, putting it into a destination
12692      register if it needs to be manipulated.  */
12693   if (!register_operand (value, mode)
12694       || code == AND || code == MINUS)
12695     {
12696       src = out_result ? out_result : out_data;
12697       emit_move_insn (src, gen_lowpart (mode, value));
12698     }
12699   else
12700     src = value;
12701   gcc_assert (register_operand (src, mode));
12702
12703   /* Preprocess the data for the operation as necessary.  If the operation is
12704      a SET then emit a swap instruction and finish.  */
12705   switch (code)
12706     {
12707     case SET:
12708       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12709       return;
12710
12711     case MINUS:
12712       /* Negate the value and treat it as a PLUS.  */
12713       {
12714         rtx neg_src;
12715
12716         /* Resize the value if necessary.  */
12717         if (short_mode)
12718           src = gen_lowpart (wmode, src);
12719
12720         neg_src = gen_rtx_NEG (wmode, src);
12721         emit_insn (gen_rtx_SET (src, neg_src));
12722
12723         if (short_mode)
12724           src = gen_lowpart (mode, src);
12725       }
12726       /* Fall-through.  */
12727     case PLUS:
12728       ldop_code = AARCH64_LDOP_PLUS;
12729       break;
12730
12731     case IOR:
12732       ldop_code = AARCH64_LDOP_OR;
12733       break;
12734
12735     case XOR:
12736       ldop_code = AARCH64_LDOP_XOR;
12737       break;
12738
12739     case AND:
12740       {
12741         rtx not_src;
12742
12743         /* Resize the value if necessary.  */
12744         if (short_mode)
12745           src = gen_lowpart (wmode, src);
12746
12747         not_src = gen_rtx_NOT (wmode, src);
12748         emit_insn (gen_rtx_SET (src, not_src));
12749
12750         if (short_mode)
12751           src = gen_lowpart (mode, src);
12752       }
12753       ldop_code = AARCH64_LDOP_BIC;
12754       break;
12755
12756     default:
12757       /* The operation can't be done with atomic instructions.  */
12758       gcc_unreachable ();
12759     }
12760
12761   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12762
12763   /* If necessary, calculate the data in memory after the update by redoing the
12764      operation from values in registers.  */
12765   if (!out_result)
12766     return;
12767
12768   if (short_mode)
12769     {
12770       src = gen_lowpart (wmode, src);
12771       out_data = gen_lowpart (wmode, out_data);
12772       out_result = gen_lowpart (wmode, out_result);
12773     }
12774
12775   x = NULL_RTX;
12776
12777   switch (code)
12778     {
12779     case MINUS:
12780     case PLUS:
12781       x = gen_rtx_PLUS (wmode, out_data, src);
12782       break;
12783     case IOR:
12784       x = gen_rtx_IOR (wmode, out_data, src);
12785       break;
12786     case XOR:
12787       x = gen_rtx_XOR (wmode, out_data, src);
12788       break;
12789     case AND:
12790       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12791       return;
12792     default:
12793       gcc_unreachable ();
12794     }
12795
12796   emit_set_insn (out_result, x);
12797
12798   return;
12799 }
12800
12801 /* Split an atomic operation.  */
12802
12803 void
12804 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12805                          rtx value, rtx model_rtx, rtx cond)
12806 {
12807   machine_mode mode = GET_MODE (mem);
12808   machine_mode wmode = (mode == DImode ? DImode : SImode);
12809   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12810   const bool is_sync = is_mm_sync (model);
12811   rtx_code_label *label;
12812   rtx x;
12813
12814   /* Split the atomic operation into a sequence.  */
12815   label = gen_label_rtx ();
12816   emit_label (label);
12817
12818   if (new_out)
12819     new_out = gen_lowpart (wmode, new_out);
12820   if (old_out)
12821     old_out = gen_lowpart (wmode, old_out);
12822   else
12823     old_out = new_out;
12824   value = simplify_gen_subreg (wmode, value, mode, 0);
12825
12826   /* The initial load can be relaxed for a __sync operation since a final
12827      barrier will be emitted to stop code hoisting.  */
12828  if (is_sync)
12829     aarch64_emit_load_exclusive (mode, old_out, mem,
12830                                  GEN_INT (MEMMODEL_RELAXED));
12831   else
12832     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12833
12834   switch (code)
12835     {
12836     case SET:
12837       new_out = value;
12838       break;
12839
12840     case NOT:
12841       x = gen_rtx_AND (wmode, old_out, value);
12842       emit_insn (gen_rtx_SET (new_out, x));
12843       x = gen_rtx_NOT (wmode, new_out);
12844       emit_insn (gen_rtx_SET (new_out, x));
12845       break;
12846
12847     case MINUS:
12848       if (CONST_INT_P (value))
12849         {
12850           value = GEN_INT (-INTVAL (value));
12851           code = PLUS;
12852         }
12853       /* Fall through.  */
12854
12855     default:
12856       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12857       emit_insn (gen_rtx_SET (new_out, x));
12858       break;
12859     }
12860
12861   aarch64_emit_store_exclusive (mode, cond, mem,
12862                                 gen_lowpart (mode, new_out), model_rtx);
12863
12864   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12865   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12866                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12867   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12868
12869   /* Emit any final barrier needed for a __sync operation.  */
12870   if (is_sync)
12871     aarch64_emit_post_barrier (model);
12872 }
12873
12874 static void
12875 aarch64_init_libfuncs (void)
12876 {
12877    /* Half-precision float operations.  The compiler handles all operations
12878      with NULL libfuncs by converting to SFmode.  */
12879
12880   /* Conversions.  */
12881   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12882   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12883
12884   /* Arithmetic.  */
12885   set_optab_libfunc (add_optab, HFmode, NULL);
12886   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12887   set_optab_libfunc (smul_optab, HFmode, NULL);
12888   set_optab_libfunc (neg_optab, HFmode, NULL);
12889   set_optab_libfunc (sub_optab, HFmode, NULL);
12890
12891   /* Comparisons.  */
12892   set_optab_libfunc (eq_optab, HFmode, NULL);
12893   set_optab_libfunc (ne_optab, HFmode, NULL);
12894   set_optab_libfunc (lt_optab, HFmode, NULL);
12895   set_optab_libfunc (le_optab, HFmode, NULL);
12896   set_optab_libfunc (ge_optab, HFmode, NULL);
12897   set_optab_libfunc (gt_optab, HFmode, NULL);
12898   set_optab_libfunc (unord_optab, HFmode, NULL);
12899 }
12900
12901 /* Target hook for c_mode_for_suffix.  */
12902 static machine_mode
12903 aarch64_c_mode_for_suffix (char suffix)
12904 {
12905   if (suffix == 'q')
12906     return TFmode;
12907
12908   return VOIDmode;
12909 }
12910
12911 /* We can only represent floating point constants which will fit in
12912    "quarter-precision" values.  These values are characterised by
12913    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12914    by:
12915
12916    (-1)^s * (n/16) * 2^r
12917
12918    Where:
12919      's' is the sign bit.
12920      'n' is an integer in the range 16 <= n <= 31.
12921      'r' is an integer in the range -3 <= r <= 4.  */
12922
12923 /* Return true iff X can be represented by a quarter-precision
12924    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12925 bool
12926 aarch64_float_const_representable_p (rtx x)
12927 {
12928   /* This represents our current view of how many bits
12929      make up the mantissa.  */
12930   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12931   int exponent;
12932   unsigned HOST_WIDE_INT mantissa, mask;
12933   REAL_VALUE_TYPE r, m;
12934   bool fail;
12935
12936   if (!CONST_DOUBLE_P (x))
12937     return false;
12938
12939   /* We don't support HFmode constants yet.  */
12940   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12941     return false;
12942
12943   r = *CONST_DOUBLE_REAL_VALUE (x);
12944
12945   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12946      know if we have +zero until we analyse the mantissa, but we
12947      can reject the other invalid values.  */
12948   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12949       || REAL_VALUE_MINUS_ZERO (r))
12950     return false;
12951
12952   /* Extract exponent.  */
12953   r = real_value_abs (&r);
12954   exponent = REAL_EXP (&r);
12955
12956   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12957      highest (sign) bit, with a fixed binary point at bit point_pos.
12958      m1 holds the low part of the mantissa, m2 the high part.
12959      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12960      bits for the mantissa, this can fail (low bits will be lost).  */
12961   real_ldexp (&m, &r, point_pos - exponent);
12962   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12963
12964   /* If the low part of the mantissa has bits set we cannot represent
12965      the value.  */
12966   if (w.ulow () != 0)
12967     return false;
12968   /* We have rejected the lower HOST_WIDE_INT, so update our
12969      understanding of how many bits lie in the mantissa and
12970      look only at the high HOST_WIDE_INT.  */
12971   mantissa = w.elt (1);
12972   point_pos -= HOST_BITS_PER_WIDE_INT;
12973
12974   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12975   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12976   if ((mantissa & mask) != 0)
12977     return false;
12978
12979   /* Having filtered unrepresentable values, we may now remove all
12980      but the highest 5 bits.  */
12981   mantissa >>= point_pos - 5;
12982
12983   /* We cannot represent the value 0.0, so reject it.  This is handled
12984      elsewhere.  */
12985   if (mantissa == 0)
12986     return false;
12987
12988   /* Then, as bit 4 is always set, we can mask it off, leaving
12989      the mantissa in the range [0, 15].  */
12990   mantissa &= ~(1 << 4);
12991   gcc_assert (mantissa <= 15);
12992
12993   /* GCC internally does not use IEEE754-like encoding (where normalized
12994      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12995      Our mantissa values are shifted 4 places to the left relative to
12996      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12997      by 5 places to correct for GCC's representation.  */
12998   exponent = 5 - exponent;
12999
13000   return (exponent >= 0 && exponent <= 7);
13001 }
13002
13003 char*
13004 aarch64_output_simd_mov_immediate (rtx const_vector,
13005                                    machine_mode mode,
13006                                    unsigned width)
13007 {
13008   bool is_valid;
13009   static char templ[40];
13010   const char *mnemonic;
13011   const char *shift_op;
13012   unsigned int lane_count = 0;
13013   char element_char;
13014
13015   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
13016
13017   /* This will return true to show const_vector is legal for use as either
13018      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
13019      also update INFO to show how the immediate should be generated.  */
13020   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
13021   gcc_assert (is_valid);
13022
13023   element_char = sizetochar (info.element_width);
13024   lane_count = width / info.element_width;
13025
13026   mode = GET_MODE_INNER (mode);
13027   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13028     {
13029       gcc_assert (info.shift == 0 && ! info.mvn);
13030       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13031          move immediate path.  */
13032       if (aarch64_float_const_zero_rtx_p (info.value))
13033         info.value = GEN_INT (0);
13034       else
13035         {
13036           const unsigned int buf_size = 20;
13037           char float_buf[buf_size] = {'\0'};
13038           real_to_decimal_for_mode (float_buf,
13039                                     CONST_DOUBLE_REAL_VALUE (info.value),
13040                                     buf_size, buf_size, 1, mode);
13041
13042           if (lane_count == 1)
13043             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13044           else
13045             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13046                       lane_count, element_char, float_buf);
13047           return templ;
13048         }
13049     }
13050
13051   mnemonic = info.mvn ? "mvni" : "movi";
13052   shift_op = info.msl ? "msl" : "lsl";
13053
13054   gcc_assert (CONST_INT_P (info.value));
13055   if (lane_count == 1)
13056     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13057               mnemonic, UINTVAL (info.value));
13058   else if (info.shift)
13059     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13060               ", %s %d", mnemonic, lane_count, element_char,
13061               UINTVAL (info.value), shift_op, info.shift);
13062   else
13063     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13064               mnemonic, lane_count, element_char, UINTVAL (info.value));
13065   return templ;
13066 }
13067
13068 char*
13069 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
13070 {
13071
13072   /* If a floating point number was passed and we desire to use it in an
13073      integer mode do the conversion to integer.  */
13074   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13075     {
13076       unsigned HOST_WIDE_INT ival;
13077       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13078           gcc_unreachable ();
13079       immediate = gen_int_mode (ival, mode);
13080     }
13081
13082   machine_mode vmode;
13083   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13084      a 128 bit vector mode.  */
13085   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13086
13087   vmode = aarch64_simd_container_mode (mode, width);
13088   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13089   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13090 }
13091
13092 /* Split operands into moves from op[1] + op[2] into op[0].  */
13093
13094 void
13095 aarch64_split_combinev16qi (rtx operands[3])
13096 {
13097   unsigned int dest = REGNO (operands[0]);
13098   unsigned int src1 = REGNO (operands[1]);
13099   unsigned int src2 = REGNO (operands[2]);
13100   machine_mode halfmode = GET_MODE (operands[1]);
13101   unsigned int halfregs = REG_NREGS (operands[1]);
13102   rtx destlo, desthi;
13103
13104   gcc_assert (halfmode == V16QImode);
13105
13106   if (src1 == dest && src2 == dest + halfregs)
13107     {
13108       /* No-op move.  Can't split to nothing; emit something.  */
13109       emit_note (NOTE_INSN_DELETED);
13110       return;
13111     }
13112
13113   /* Preserve register attributes for variable tracking.  */
13114   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13115   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13116                                GET_MODE_SIZE (halfmode));
13117
13118   /* Special case of reversed high/low parts.  */
13119   if (reg_overlap_mentioned_p (operands[2], destlo)
13120       && reg_overlap_mentioned_p (operands[1], desthi))
13121     {
13122       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13123       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13124       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13125     }
13126   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13127     {
13128       /* Try to avoid unnecessary moves if part of the result
13129          is in the right place already.  */
13130       if (src1 != dest)
13131         emit_move_insn (destlo, operands[1]);
13132       if (src2 != dest + halfregs)
13133         emit_move_insn (desthi, operands[2]);
13134     }
13135   else
13136     {
13137       if (src2 != dest + halfregs)
13138         emit_move_insn (desthi, operands[2]);
13139       if (src1 != dest)
13140         emit_move_insn (destlo, operands[1]);
13141     }
13142 }
13143
13144 /* vec_perm support.  */
13145
13146 #define MAX_VECT_LEN 16
13147
13148 struct expand_vec_perm_d
13149 {
13150   rtx target, op0, op1;
13151   auto_vec_perm_indices perm;
13152   machine_mode vmode;
13153   bool one_vector_p;
13154   bool testing_p;
13155 };
13156
13157 /* Generate a variable permutation.  */
13158
13159 static void
13160 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13161 {
13162   machine_mode vmode = GET_MODE (target);
13163   bool one_vector_p = rtx_equal_p (op0, op1);
13164
13165   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13166   gcc_checking_assert (GET_MODE (op0) == vmode);
13167   gcc_checking_assert (GET_MODE (op1) == vmode);
13168   gcc_checking_assert (GET_MODE (sel) == vmode);
13169   gcc_checking_assert (TARGET_SIMD);
13170
13171   if (one_vector_p)
13172     {
13173       if (vmode == V8QImode)
13174         {
13175           /* Expand the argument to a V16QI mode by duplicating it.  */
13176           rtx pair = gen_reg_rtx (V16QImode);
13177           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13178           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13179         }
13180       else
13181         {
13182           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13183         }
13184     }
13185   else
13186     {
13187       rtx pair;
13188
13189       if (vmode == V8QImode)
13190         {
13191           pair = gen_reg_rtx (V16QImode);
13192           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13193           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13194         }
13195       else
13196         {
13197           pair = gen_reg_rtx (OImode);
13198           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13199           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13200         }
13201     }
13202 }
13203
13204 void
13205 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13206 {
13207   machine_mode vmode = GET_MODE (target);
13208   unsigned int nelt = GET_MODE_NUNITS (vmode);
13209   bool one_vector_p = rtx_equal_p (op0, op1);
13210   rtx mask;
13211
13212   /* The TBL instruction does not use a modulo index, so we must take care
13213      of that ourselves.  */
13214   mask = aarch64_simd_gen_const_vector_dup (vmode,
13215       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13216   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13217
13218   /* For big-endian, we also need to reverse the index within the vector
13219      (but not which vector).  */
13220   if (BYTES_BIG_ENDIAN)
13221     {
13222       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13223       if (!one_vector_p)
13224         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13225       sel = expand_simple_binop (vmode, XOR, sel, mask,
13226                                  NULL, 0, OPTAB_LIB_WIDEN);
13227     }
13228   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13229 }
13230
13231 /* Recognize patterns suitable for the TRN instructions.  */
13232 static bool
13233 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13234 {
13235   unsigned int i, odd, mask, nelt = d->perm.length ();
13236   rtx out, in0, in1, x;
13237   rtx (*gen) (rtx, rtx, rtx);
13238   machine_mode vmode = d->vmode;
13239
13240   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13241     return false;
13242
13243   /* Note that these are little-endian tests.
13244      We correct for big-endian later.  */
13245   if (d->perm[0] == 0)
13246     odd = 0;
13247   else if (d->perm[0] == 1)
13248     odd = 1;
13249   else
13250     return false;
13251   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13252
13253   for (i = 0; i < nelt; i += 2)
13254     {
13255       if (d->perm[i] != i + odd)
13256         return false;
13257       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13258         return false;
13259     }
13260
13261   /* Success!  */
13262   if (d->testing_p)
13263     return true;
13264
13265   in0 = d->op0;
13266   in1 = d->op1;
13267   if (BYTES_BIG_ENDIAN)
13268     {
13269       x = in0, in0 = in1, in1 = x;
13270       odd = !odd;
13271     }
13272   out = d->target;
13273
13274   if (odd)
13275     {
13276       switch (vmode)
13277         {
13278         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13279         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13280         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13281         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13282         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13283         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13284         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13285         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13286         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13287         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13288         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13289         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13290         default:
13291           return false;
13292         }
13293     }
13294   else
13295     {
13296       switch (vmode)
13297         {
13298         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13299         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13300         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13301         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13302         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13303         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13304         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13305         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13306         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13307         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13308         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13309         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13310         default:
13311           return false;
13312         }
13313     }
13314
13315   emit_insn (gen (out, in0, in1));
13316   return true;
13317 }
13318
13319 /* Recognize patterns suitable for the UZP instructions.  */
13320 static bool
13321 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13322 {
13323   unsigned int i, odd, mask, nelt = d->perm.length ();
13324   rtx out, in0, in1, x;
13325   rtx (*gen) (rtx, rtx, rtx);
13326   machine_mode vmode = d->vmode;
13327
13328   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13329     return false;
13330
13331   /* Note that these are little-endian tests.
13332      We correct for big-endian later.  */
13333   if (d->perm[0] == 0)
13334     odd = 0;
13335   else if (d->perm[0] == 1)
13336     odd = 1;
13337   else
13338     return false;
13339   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13340
13341   for (i = 0; i < nelt; i++)
13342     {
13343       unsigned elt = (i * 2 + odd) & mask;
13344       if (d->perm[i] != elt)
13345         return false;
13346     }
13347
13348   /* Success!  */
13349   if (d->testing_p)
13350     return true;
13351
13352   in0 = d->op0;
13353   in1 = d->op1;
13354   if (BYTES_BIG_ENDIAN)
13355     {
13356       x = in0, in0 = in1, in1 = x;
13357       odd = !odd;
13358     }
13359   out = d->target;
13360
13361   if (odd)
13362     {
13363       switch (vmode)
13364         {
13365         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13366         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13367         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13368         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13369         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13370         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13371         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13372         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13373         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13374         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13375         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13376         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13377         default:
13378           return false;
13379         }
13380     }
13381   else
13382     {
13383       switch (vmode)
13384         {
13385         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13386         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13387         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13388         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13389         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13390         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13391         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13392         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13393         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13394         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13395         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13396         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13397         default:
13398           return false;
13399         }
13400     }
13401
13402   emit_insn (gen (out, in0, in1));
13403   return true;
13404 }
13405
13406 /* Recognize patterns suitable for the ZIP instructions.  */
13407 static bool
13408 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13409 {
13410   unsigned int i, high, mask, nelt = d->perm.length ();
13411   rtx out, in0, in1, x;
13412   rtx (*gen) (rtx, rtx, rtx);
13413   machine_mode vmode = d->vmode;
13414
13415   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13416     return false;
13417
13418   /* Note that these are little-endian tests.
13419      We correct for big-endian later.  */
13420   high = nelt / 2;
13421   if (d->perm[0] == high)
13422     /* Do Nothing.  */
13423     ;
13424   else if (d->perm[0] == 0)
13425     high = 0;
13426   else
13427     return false;
13428   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13429
13430   for (i = 0; i < nelt / 2; i++)
13431     {
13432       unsigned elt = (i + high) & mask;
13433       if (d->perm[i * 2] != elt)
13434         return false;
13435       elt = (elt + nelt) & mask;
13436       if (d->perm[i * 2 + 1] != elt)
13437         return false;
13438     }
13439
13440   /* Success!  */
13441   if (d->testing_p)
13442     return true;
13443
13444   in0 = d->op0;
13445   in1 = d->op1;
13446   if (BYTES_BIG_ENDIAN)
13447     {
13448       x = in0, in0 = in1, in1 = x;
13449       high = !high;
13450     }
13451   out = d->target;
13452
13453   if (high)
13454     {
13455       switch (vmode)
13456         {
13457         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13458         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13459         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13460         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13461         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13462         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13463         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13464         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13465         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13466         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13467         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13468         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13469         default:
13470           return false;
13471         }
13472     }
13473   else
13474     {
13475       switch (vmode)
13476         {
13477         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13478         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13479         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13480         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13481         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13482         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13483         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13484         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13485         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13486         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13487         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13488         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13489         default:
13490           return false;
13491         }
13492     }
13493
13494   emit_insn (gen (out, in0, in1));
13495   return true;
13496 }
13497
13498 /* Recognize patterns for the EXT insn.  */
13499
13500 static bool
13501 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13502 {
13503   unsigned int i, nelt = d->perm.length ();
13504   rtx (*gen) (rtx, rtx, rtx, rtx);
13505   rtx offset;
13506
13507   unsigned int location = d->perm[0]; /* Always < nelt.  */
13508
13509   /* Check if the extracted indices are increasing by one.  */
13510   for (i = 1; i < nelt; i++)
13511     {
13512       unsigned int required = location + i;
13513       if (d->one_vector_p)
13514         {
13515           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13516           required &= (nelt - 1);
13517         }
13518       if (d->perm[i] != required)
13519         return false;
13520     }
13521
13522   switch (d->vmode)
13523     {
13524     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13525     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13526     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13527     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13528     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13529     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13530     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13531     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13532     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13533     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13534     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13535     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13536     default:
13537       return false;
13538     }
13539
13540   /* Success! */
13541   if (d->testing_p)
13542     return true;
13543
13544   /* The case where (location == 0) is a no-op for both big- and little-endian,
13545      and is removed by the mid-end at optimization levels -O1 and higher.  */
13546
13547   if (BYTES_BIG_ENDIAN && (location != 0))
13548     {
13549       /* After setup, we want the high elements of the first vector (stored
13550          at the LSB end of the register), and the low elements of the second
13551          vector (stored at the MSB end of the register). So swap.  */
13552       std::swap (d->op0, d->op1);
13553       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13554       location = nelt - location;
13555     }
13556
13557   offset = GEN_INT (location);
13558   emit_insn (gen (d->target, d->op0, d->op1, offset));
13559   return true;
13560 }
13561
13562 /* Recognize patterns for the REV insns.  */
13563
13564 static bool
13565 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13566 {
13567   unsigned int i, j, diff, nelt = d->perm.length ();
13568   rtx (*gen) (rtx, rtx);
13569
13570   if (!d->one_vector_p)
13571     return false;
13572
13573   diff = d->perm[0];
13574   switch (diff)
13575     {
13576     case 7:
13577       switch (d->vmode)
13578         {
13579         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13580         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13581         default:
13582           return false;
13583         }
13584       break;
13585     case 3:
13586       switch (d->vmode)
13587         {
13588         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13589         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13590         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13591         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13592         default:
13593           return false;
13594         }
13595       break;
13596     case 1:
13597       switch (d->vmode)
13598         {
13599         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13600         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13601         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13602         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13603         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13604         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13605         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13606         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13607         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13608         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13609         default:
13610           return false;
13611         }
13612       break;
13613     default:
13614       return false;
13615     }
13616
13617   for (i = 0; i < nelt ; i += diff + 1)
13618     for (j = 0; j <= diff; j += 1)
13619       {
13620         /* This is guaranteed to be true as the value of diff
13621            is 7, 3, 1 and we should have enough elements in the
13622            queue to generate this.  Getting a vector mask with a
13623            value of diff other than these values implies that
13624            something is wrong by the time we get here.  */
13625         gcc_assert (i + j < nelt);
13626         if (d->perm[i + j] != i + diff - j)
13627           return false;
13628       }
13629
13630   /* Success! */
13631   if (d->testing_p)
13632     return true;
13633
13634   emit_insn (gen (d->target, d->op0));
13635   return true;
13636 }
13637
13638 static bool
13639 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13640 {
13641   rtx (*gen) (rtx, rtx, rtx);
13642   rtx out = d->target;
13643   rtx in0;
13644   machine_mode vmode = d->vmode;
13645   unsigned int i, elt, nelt = d->perm.length ();
13646   rtx lane;
13647
13648   elt = d->perm[0];
13649   for (i = 1; i < nelt; i++)
13650     {
13651       if (elt != d->perm[i])
13652         return false;
13653     }
13654
13655   /* The generic preparation in aarch64_expand_vec_perm_const_1
13656      swaps the operand order and the permute indices if it finds
13657      d->perm[0] to be in the second operand.  Thus, we can always
13658      use d->op0 and need not do any extra arithmetic to get the
13659      correct lane number.  */
13660   in0 = d->op0;
13661   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13662
13663   switch (vmode)
13664     {
13665     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13666     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13667     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13668     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13669     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13670     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13671     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13672     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13673     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13674     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13675     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13676     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13677     default:
13678       return false;
13679     }
13680
13681   emit_insn (gen (out, in0, lane));
13682   return true;
13683 }
13684
13685 static bool
13686 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13687 {
13688   rtx rperm[MAX_VECT_LEN], sel;
13689   machine_mode vmode = d->vmode;
13690   unsigned int i, nelt = d->perm.length ();
13691
13692   if (d->testing_p)
13693     return true;
13694
13695   /* Generic code will try constant permutation twice.  Once with the
13696      original mode and again with the elements lowered to QImode.
13697      So wait and don't do the selector expansion ourselves.  */
13698   if (vmode != V8QImode && vmode != V16QImode)
13699     return false;
13700
13701   for (i = 0; i < nelt; ++i)
13702     {
13703       int nunits = GET_MODE_NUNITS (vmode);
13704
13705       /* If big-endian and two vectors we end up with a weird mixed-endian
13706          mode on NEON.  Reverse the index within each word but not the word
13707          itself.  */
13708       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13709                                            : d->perm[i]);
13710     }
13711   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13712   sel = force_reg (vmode, sel);
13713
13714   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13715   return true;
13716 }
13717
13718 static bool
13719 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13720 {
13721   /* The pattern matching functions above are written to look for a small
13722      number to begin the sequence (0, 1, N/2).  If we begin with an index
13723      from the second operand, we can swap the operands.  */
13724   unsigned int nelt = d->perm.length ();
13725   if (d->perm[0] >= nelt)
13726     {
13727       gcc_assert (nelt == (nelt & -nelt));
13728       for (unsigned int i = 0; i < nelt; ++i)
13729         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13730
13731       std::swap (d->op0, d->op1);
13732     }
13733
13734   if (TARGET_SIMD)
13735     {
13736       if (aarch64_evpc_rev (d))
13737         return true;
13738       else if (aarch64_evpc_ext (d))
13739         return true;
13740       else if (aarch64_evpc_dup (d))
13741         return true;
13742       else if (aarch64_evpc_zip (d))
13743         return true;
13744       else if (aarch64_evpc_uzp (d))
13745         return true;
13746       else if (aarch64_evpc_trn (d))
13747         return true;
13748       return aarch64_evpc_tbl (d);
13749     }
13750   return false;
13751 }
13752
13753 /* Expand a vec_perm_const pattern.  */
13754
13755 bool
13756 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13757 {
13758   struct expand_vec_perm_d d;
13759   int i, nelt, which;
13760
13761   d.target = target;
13762   d.op0 = op0;
13763   d.op1 = op1;
13764
13765   d.vmode = GET_MODE (target);
13766   gcc_assert (VECTOR_MODE_P (d.vmode));
13767   d.testing_p = false;
13768
13769   nelt = GET_MODE_NUNITS (d.vmode);
13770   d.perm.reserve (nelt);
13771   for (i = which = 0; i < nelt; ++i)
13772     {
13773       rtx e = XVECEXP (sel, 0, i);
13774       int ei = INTVAL (e) & (2 * nelt - 1);
13775       which |= (ei < nelt ? 1 : 2);
13776       d.perm.quick_push (ei);
13777     }
13778
13779   switch (which)
13780     {
13781     default:
13782       gcc_unreachable ();
13783
13784     case 3:
13785       d.one_vector_p = false;
13786       if (!rtx_equal_p (op0, op1))
13787         break;
13788
13789       /* The elements of PERM do not suggest that only the first operand
13790          is used, but both operands are identical.  Allow easier matching
13791          of the permutation by folding the permutation into the single
13792          input vector.  */
13793       /* Fall Through.  */
13794     case 2:
13795       for (i = 0; i < nelt; ++i)
13796         d.perm[i] &= nelt - 1;
13797       d.op0 = op1;
13798       d.one_vector_p = true;
13799       break;
13800
13801     case 1:
13802       d.op1 = op0;
13803       d.one_vector_p = true;
13804       break;
13805     }
13806
13807   return aarch64_expand_vec_perm_const_1 (&d);
13808 }
13809
13810 static bool
13811 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
13812 {
13813   struct expand_vec_perm_d d;
13814   unsigned int i, nelt, which;
13815   bool ret;
13816
13817   d.vmode = vmode;
13818   d.testing_p = true;
13819   d.perm.safe_splice (sel);
13820
13821   /* Calculate whether all elements are in one vector.  */
13822   nelt = sel.length ();
13823   for (i = which = 0; i < nelt; ++i)
13824     {
13825       unsigned int e = d.perm[i];
13826       gcc_assert (e < 2 * nelt);
13827       which |= (e < nelt ? 1 : 2);
13828     }
13829
13830   /* If all elements are from the second vector, reindex as if from the
13831      first vector.  */
13832   if (which == 2)
13833     for (i = 0; i < nelt; ++i)
13834       d.perm[i] -= nelt;
13835
13836   /* Check whether the mask can be applied to a single vector.  */
13837   d.one_vector_p = (which != 3);
13838
13839   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13840   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13841   if (!d.one_vector_p)
13842     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13843
13844   start_sequence ();
13845   ret = aarch64_expand_vec_perm_const_1 (&d);
13846   end_sequence ();
13847
13848   return ret;
13849 }
13850
13851 rtx
13852 aarch64_reverse_mask (machine_mode mode)
13853 {
13854   /* We have to reverse each vector because we dont have
13855      a permuted load that can reverse-load according to ABI rules.  */
13856   rtx mask;
13857   rtvec v = rtvec_alloc (16);
13858   int i, j;
13859   int nunits = GET_MODE_NUNITS (mode);
13860   int usize = GET_MODE_UNIT_SIZE (mode);
13861
13862   gcc_assert (BYTES_BIG_ENDIAN);
13863   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13864
13865   for (i = 0; i < nunits; i++)
13866     for (j = 0; j < usize; j++)
13867       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13868   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13869   return force_reg (V16QImode, mask);
13870 }
13871
13872 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
13873    true.  However due to issues with register allocation it is preferable
13874    to avoid tieing integer scalar and FP scalar modes.  Executing integer
13875    operations in general registers is better than treating them as scalar
13876    vector operations.  This reduces latency and avoids redundant int<->FP
13877    moves.  So tie modes if they are either the same class, or vector modes
13878    with other vector modes, vector structs or any scalar mode.  */
13879
13880 static bool
13881 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13882 {
13883   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13884     return true;
13885
13886   /* We specifically want to allow elements of "structure" modes to
13887      be tieable to the structure.  This more general condition allows
13888      other rarer situations too.  */
13889   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13890     return true;
13891
13892   /* Also allow any scalar modes with vectors.  */
13893   if (aarch64_vector_mode_supported_p (mode1)
13894       || aarch64_vector_mode_supported_p (mode2))
13895     return true;
13896
13897   return false;
13898 }
13899
13900 /* Return a new RTX holding the result of moving POINTER forward by
13901    AMOUNT bytes.  */
13902
13903 static rtx
13904 aarch64_move_pointer (rtx pointer, int amount)
13905 {
13906   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13907
13908   return adjust_automodify_address (pointer, GET_MODE (pointer),
13909                                     next, amount);
13910 }
13911
13912 /* Return a new RTX holding the result of moving POINTER forward by the
13913    size of the mode it points to.  */
13914
13915 static rtx
13916 aarch64_progress_pointer (rtx pointer)
13917 {
13918   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13919
13920   return aarch64_move_pointer (pointer, amount);
13921 }
13922
13923 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13924    MODE bytes.  */
13925
13926 static void
13927 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13928                                               machine_mode mode)
13929 {
13930   rtx reg = gen_reg_rtx (mode);
13931
13932   /* "Cast" the pointers to the correct mode.  */
13933   *src = adjust_address (*src, mode, 0);
13934   *dst = adjust_address (*dst, mode, 0);
13935   /* Emit the memcpy.  */
13936   emit_move_insn (reg, *src);
13937   emit_move_insn (*dst, reg);
13938   /* Move the pointers forward.  */
13939   *src = aarch64_progress_pointer (*src);
13940   *dst = aarch64_progress_pointer (*dst);
13941 }
13942
13943 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13944    we succeed, otherwise return false.  */
13945
13946 bool
13947 aarch64_expand_movmem (rtx *operands)
13948 {
13949   unsigned int n;
13950   rtx dst = operands[0];
13951   rtx src = operands[1];
13952   rtx base;
13953   bool speed_p = !optimize_function_for_size_p (cfun);
13954
13955   /* When optimizing for size, give a better estimate of the length of a
13956      memcpy call, but use the default otherwise.  */
13957   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13958
13959   /* We can't do anything smart if the amount to copy is not constant.  */
13960   if (!CONST_INT_P (operands[2]))
13961     return false;
13962
13963   n = UINTVAL (operands[2]);
13964
13965   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13966      need to make at most two moves.  For cases above 16 bytes it will be one
13967      move for each 16 byte chunk, then at most two additional moves.  */
13968   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13969     return false;
13970
13971   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13972   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13973
13974   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13975   src = adjust_automodify_address (src, VOIDmode, base, 0);
13976
13977   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13978      1-byte chunk.  */
13979   if (n < 4)
13980     {
13981       if (n >= 2)
13982         {
13983           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13984           n -= 2;
13985         }
13986
13987       if (n == 1)
13988         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13989
13990       return true;
13991     }
13992
13993   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13994      4-byte chunk, partially overlapping with the previously copied chunk.  */
13995   if (n < 8)
13996     {
13997       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13998       n -= 4;
13999       if (n > 0)
14000         {
14001           int move = n - 4;
14002
14003           src = aarch64_move_pointer (src, move);
14004           dst = aarch64_move_pointer (dst, move);
14005           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14006         }
14007       return true;
14008     }
14009
14010   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
14011      them, then (if applicable) an 8-byte chunk.  */
14012   while (n >= 8)
14013     {
14014       if (n / 16)
14015         {
14016           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
14017           n -= 16;
14018         }
14019       else
14020         {
14021           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14022           n -= 8;
14023         }
14024     }
14025
14026   /* Finish the final bytes of the copy.  We can always do this in one
14027      instruction.  We either copy the exact amount we need, or partially
14028      overlap with the previous chunk we copied and copy 8-bytes.  */
14029   if (n == 0)
14030     return true;
14031   else if (n == 1)
14032     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14033   else if (n == 2)
14034     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14035   else if (n == 4)
14036     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14037   else
14038     {
14039       if (n == 3)
14040         {
14041           src = aarch64_move_pointer (src, -1);
14042           dst = aarch64_move_pointer (dst, -1);
14043           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14044         }
14045       else
14046         {
14047           int move = n - 8;
14048
14049           src = aarch64_move_pointer (src, move);
14050           dst = aarch64_move_pointer (dst, move);
14051           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14052         }
14053     }
14054
14055   return true;
14056 }
14057
14058 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14059    SImode stores.  Handle the case when the constant has identical
14060    bottom and top halves.  This is beneficial when the two stores can be
14061    merged into an STP and we avoid synthesising potentially expensive
14062    immediates twice.  Return true if such a split is possible.  */
14063
14064 bool
14065 aarch64_split_dimode_const_store (rtx dst, rtx src)
14066 {
14067   rtx lo = gen_lowpart (SImode, src);
14068   rtx hi = gen_highpart_mode (SImode, DImode, src);
14069
14070   bool size_p = optimize_function_for_size_p (cfun);
14071
14072   if (!rtx_equal_p (lo, hi))
14073     return false;
14074
14075   unsigned int orig_cost
14076     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14077   unsigned int lo_cost
14078     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14079
14080   /* We want to transform:
14081      MOV        x1, 49370
14082      MOVK       x1, 0x140, lsl 16
14083      MOVK       x1, 0xc0da, lsl 32
14084      MOVK       x1, 0x140, lsl 48
14085      STR        x1, [x0]
14086    into:
14087      MOV        w1, 49370
14088      MOVK       w1, 0x140, lsl 16
14089      STP        w1, w1, [x0]
14090    So we want to perform this only when we save two instructions
14091    or more.  When optimizing for size, however, accept any code size
14092    savings we can.  */
14093   if (size_p && orig_cost <= lo_cost)
14094     return false;
14095
14096   if (!size_p
14097       && (orig_cost <= lo_cost + 1))
14098     return false;
14099
14100   rtx mem_lo = adjust_address (dst, SImode, 0);
14101   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14102     return false;
14103
14104   rtx tmp_reg = gen_reg_rtx (SImode);
14105   aarch64_expand_mov_immediate (tmp_reg, lo);
14106   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14107   /* Don't emit an explicit store pair as this may not be always profitable.
14108      Let the sched-fusion logic decide whether to merge them.  */
14109   emit_move_insn (mem_lo, tmp_reg);
14110   emit_move_insn (mem_hi, tmp_reg);
14111
14112   return true;
14113 }
14114
14115 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14116
14117 static unsigned HOST_WIDE_INT
14118 aarch64_asan_shadow_offset (void)
14119 {
14120   return (HOST_WIDE_INT_1 << 36);
14121 }
14122
14123 static bool
14124 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14125                                         unsigned int align,
14126                                         enum by_pieces_operation op,
14127                                         bool speed_p)
14128 {
14129   /* STORE_BY_PIECES can be used when copying a constant string, but
14130      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14131      For now we always fail this and let the move_by_pieces code copy
14132      the string from read-only memory.  */
14133   if (op == STORE_BY_PIECES)
14134     return false;
14135
14136   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14137 }
14138
14139 static rtx
14140 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14141                         int code, tree treeop0, tree treeop1)
14142 {
14143   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14144   rtx op0, op1;
14145   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14146   insn_code icode;
14147   struct expand_operand ops[4];
14148
14149   start_sequence ();
14150   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14151
14152   op_mode = GET_MODE (op0);
14153   if (op_mode == VOIDmode)
14154     op_mode = GET_MODE (op1);
14155
14156   switch (op_mode)
14157     {
14158     case E_QImode:
14159     case E_HImode:
14160     case E_SImode:
14161       cmp_mode = SImode;
14162       icode = CODE_FOR_cmpsi;
14163       break;
14164
14165     case E_DImode:
14166       cmp_mode = DImode;
14167       icode = CODE_FOR_cmpdi;
14168       break;
14169
14170     case E_SFmode:
14171       cmp_mode = SFmode;
14172       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14173       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14174       break;
14175
14176     case E_DFmode:
14177       cmp_mode = DFmode;
14178       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14179       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14180       break;
14181
14182     default:
14183       end_sequence ();
14184       return NULL_RTX;
14185     }
14186
14187   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14188   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14189   if (!op0 || !op1)
14190     {
14191       end_sequence ();
14192       return NULL_RTX;
14193     }
14194   *prep_seq = get_insns ();
14195   end_sequence ();
14196
14197   create_fixed_operand (&ops[0], op0);
14198   create_fixed_operand (&ops[1], op1);
14199
14200   start_sequence ();
14201   if (!maybe_expand_insn (icode, 2, ops))
14202     {
14203       end_sequence ();
14204       return NULL_RTX;
14205     }
14206   *gen_seq = get_insns ();
14207   end_sequence ();
14208
14209   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14210                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14211 }
14212
14213 static rtx
14214 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14215                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14216 {
14217   rtx op0, op1, target;
14218   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14219   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14220   insn_code icode;
14221   struct expand_operand ops[6];
14222   int aarch64_cond;
14223
14224   push_to_sequence (*prep_seq);
14225   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14226
14227   op_mode = GET_MODE (op0);
14228   if (op_mode == VOIDmode)
14229     op_mode = GET_MODE (op1);
14230
14231   switch (op_mode)
14232     {
14233     case E_QImode:
14234     case E_HImode:
14235     case E_SImode:
14236       cmp_mode = SImode;
14237       icode = CODE_FOR_ccmpsi;
14238       break;
14239
14240     case E_DImode:
14241       cmp_mode = DImode;
14242       icode = CODE_FOR_ccmpdi;
14243       break;
14244
14245     case E_SFmode:
14246       cmp_mode = SFmode;
14247       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14248       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14249       break;
14250
14251     case E_DFmode:
14252       cmp_mode = DFmode;
14253       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14254       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14255       break;
14256
14257     default:
14258       end_sequence ();
14259       return NULL_RTX;
14260     }
14261
14262   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14263   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14264   if (!op0 || !op1)
14265     {
14266       end_sequence ();
14267       return NULL_RTX;
14268     }
14269   *prep_seq = get_insns ();
14270   end_sequence ();
14271
14272   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14273   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14274
14275   if (bit_code != AND)
14276     {
14277       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14278                                                 GET_MODE (XEXP (prev, 0))),
14279                              VOIDmode, XEXP (prev, 0), const0_rtx);
14280       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14281     }
14282
14283   create_fixed_operand (&ops[0], XEXP (prev, 0));
14284   create_fixed_operand (&ops[1], target);
14285   create_fixed_operand (&ops[2], op0);
14286   create_fixed_operand (&ops[3], op1);
14287   create_fixed_operand (&ops[4], prev);
14288   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14289
14290   push_to_sequence (*gen_seq);
14291   if (!maybe_expand_insn (icode, 6, ops))
14292     {
14293       end_sequence ();
14294       return NULL_RTX;
14295     }
14296
14297   *gen_seq = get_insns ();
14298   end_sequence ();
14299
14300   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14301 }
14302
14303 #undef TARGET_GEN_CCMP_FIRST
14304 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14305
14306 #undef TARGET_GEN_CCMP_NEXT
14307 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14308
14309 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14310    instruction fusion of some sort.  */
14311
14312 static bool
14313 aarch64_macro_fusion_p (void)
14314 {
14315   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14316 }
14317
14318
14319 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14320    should be kept together during scheduling.  */
14321
14322 static bool
14323 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14324 {
14325   rtx set_dest;
14326   rtx prev_set = single_set (prev);
14327   rtx curr_set = single_set (curr);
14328   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14329   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14330
14331   if (!aarch64_macro_fusion_p ())
14332     return false;
14333
14334   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14335     {
14336       /* We are trying to match:
14337          prev (mov)  == (set (reg r0) (const_int imm16))
14338          curr (movk) == (set (zero_extract (reg r0)
14339                                            (const_int 16)
14340                                            (const_int 16))
14341                              (const_int imm16_1))  */
14342
14343       set_dest = SET_DEST (curr_set);
14344
14345       if (GET_CODE (set_dest) == ZERO_EXTRACT
14346           && CONST_INT_P (SET_SRC (curr_set))
14347           && CONST_INT_P (SET_SRC (prev_set))
14348           && CONST_INT_P (XEXP (set_dest, 2))
14349           && INTVAL (XEXP (set_dest, 2)) == 16
14350           && REG_P (XEXP (set_dest, 0))
14351           && REG_P (SET_DEST (prev_set))
14352           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14353         {
14354           return true;
14355         }
14356     }
14357
14358   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14359     {
14360
14361       /*  We're trying to match:
14362           prev (adrp) == (set (reg r1)
14363                               (high (symbol_ref ("SYM"))))
14364           curr (add) == (set (reg r0)
14365                              (lo_sum (reg r1)
14366                                      (symbol_ref ("SYM"))))
14367           Note that r0 need not necessarily be the same as r1, especially
14368           during pre-regalloc scheduling.  */
14369
14370       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14371           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14372         {
14373           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14374               && REG_P (XEXP (SET_SRC (curr_set), 0))
14375               && REGNO (XEXP (SET_SRC (curr_set), 0))
14376                  == REGNO (SET_DEST (prev_set))
14377               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14378                               XEXP (SET_SRC (curr_set), 1)))
14379             return true;
14380         }
14381     }
14382
14383   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14384     {
14385
14386       /* We're trying to match:
14387          prev (movk) == (set (zero_extract (reg r0)
14388                                            (const_int 16)
14389                                            (const_int 32))
14390                              (const_int imm16_1))
14391          curr (movk) == (set (zero_extract (reg r0)
14392                                            (const_int 16)
14393                                            (const_int 48))
14394                              (const_int imm16_2))  */
14395
14396       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14397           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14398           && REG_P (XEXP (SET_DEST (prev_set), 0))
14399           && REG_P (XEXP (SET_DEST (curr_set), 0))
14400           && REGNO (XEXP (SET_DEST (prev_set), 0))
14401              == REGNO (XEXP (SET_DEST (curr_set), 0))
14402           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14403           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14404           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14405           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14406           && CONST_INT_P (SET_SRC (prev_set))
14407           && CONST_INT_P (SET_SRC (curr_set)))
14408         return true;
14409
14410     }
14411   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14412     {
14413       /* We're trying to match:
14414           prev (adrp) == (set (reg r0)
14415                               (high (symbol_ref ("SYM"))))
14416           curr (ldr) == (set (reg r1)
14417                              (mem (lo_sum (reg r0)
14418                                              (symbol_ref ("SYM")))))
14419                  or
14420           curr (ldr) == (set (reg r1)
14421                              (zero_extend (mem
14422                                            (lo_sum (reg r0)
14423                                                    (symbol_ref ("SYM"))))))  */
14424       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14425           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14426         {
14427           rtx curr_src = SET_SRC (curr_set);
14428
14429           if (GET_CODE (curr_src) == ZERO_EXTEND)
14430             curr_src = XEXP (curr_src, 0);
14431
14432           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14433               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14434               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14435                  == REGNO (SET_DEST (prev_set))
14436               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14437                               XEXP (SET_SRC (prev_set), 0)))
14438               return true;
14439         }
14440     }
14441
14442   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14443        && aarch_crypto_can_dual_issue (prev, curr))
14444     return true;
14445
14446   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14447       && any_condjump_p (curr))
14448     {
14449       enum attr_type prev_type = get_attr_type (prev);
14450
14451       unsigned int condreg1, condreg2;
14452       rtx cc_reg_1;
14453       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14454       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14455
14456       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14457           && prev
14458           && modified_in_p (cc_reg_1, prev))
14459         {
14460           /* FIXME: this misses some which is considered simple arthematic
14461              instructions for ThunderX.  Simple shifts are missed here.  */
14462           if (prev_type == TYPE_ALUS_SREG
14463               || prev_type == TYPE_ALUS_IMM
14464               || prev_type == TYPE_LOGICS_REG
14465               || prev_type == TYPE_LOGICS_IMM)
14466             return true;
14467         }
14468     }
14469
14470   if (prev_set
14471       && curr_set
14472       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14473       && any_condjump_p (curr))
14474     {
14475       /* We're trying to match:
14476           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14477           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14478                                                          (const_int 0))
14479                                                  (label_ref ("SYM"))
14480                                                  (pc))  */
14481       if (SET_DEST (curr_set) == (pc_rtx)
14482           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14483           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14484           && REG_P (SET_DEST (prev_set))
14485           && REGNO (SET_DEST (prev_set))
14486              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14487         {
14488           /* Fuse ALU operations followed by conditional branch instruction.  */
14489           switch (get_attr_type (prev))
14490             {
14491             case TYPE_ALU_IMM:
14492             case TYPE_ALU_SREG:
14493             case TYPE_ADC_REG:
14494             case TYPE_ADC_IMM:
14495             case TYPE_ADCS_REG:
14496             case TYPE_ADCS_IMM:
14497             case TYPE_LOGIC_REG:
14498             case TYPE_LOGIC_IMM:
14499             case TYPE_CSEL:
14500             case TYPE_ADR:
14501             case TYPE_MOV_IMM:
14502             case TYPE_SHIFT_REG:
14503             case TYPE_SHIFT_IMM:
14504             case TYPE_BFM:
14505             case TYPE_RBIT:
14506             case TYPE_REV:
14507             case TYPE_EXTEND:
14508               return true;
14509
14510             default:;
14511             }
14512         }
14513     }
14514
14515   return false;
14516 }
14517
14518 /* Return true iff the instruction fusion described by OP is enabled.  */
14519
14520 bool
14521 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14522 {
14523   return (aarch64_tune_params.fusible_ops & op) != 0;
14524 }
14525
14526 /* If MEM is in the form of [base+offset], extract the two parts
14527    of address and set to BASE and OFFSET, otherwise return false
14528    after clearing BASE and OFFSET.  */
14529
14530 bool
14531 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14532 {
14533   rtx addr;
14534
14535   gcc_assert (MEM_P (mem));
14536
14537   addr = XEXP (mem, 0);
14538
14539   if (REG_P (addr))
14540     {
14541       *base = addr;
14542       *offset = const0_rtx;
14543       return true;
14544     }
14545
14546   if (GET_CODE (addr) == PLUS
14547       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14548     {
14549       *base = XEXP (addr, 0);
14550       *offset = XEXP (addr, 1);
14551       return true;
14552     }
14553
14554   *base = NULL_RTX;
14555   *offset = NULL_RTX;
14556
14557   return false;
14558 }
14559
14560 /* Types for scheduling fusion.  */
14561 enum sched_fusion_type
14562 {
14563   SCHED_FUSION_NONE = 0,
14564   SCHED_FUSION_LD_SIGN_EXTEND,
14565   SCHED_FUSION_LD_ZERO_EXTEND,
14566   SCHED_FUSION_LD,
14567   SCHED_FUSION_ST,
14568   SCHED_FUSION_NUM
14569 };
14570
14571 /* If INSN is a load or store of address in the form of [base+offset],
14572    extract the two parts and set to BASE and OFFSET.  Return scheduling
14573    fusion type this INSN is.  */
14574
14575 static enum sched_fusion_type
14576 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14577 {
14578   rtx x, dest, src;
14579   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14580
14581   gcc_assert (INSN_P (insn));
14582   x = PATTERN (insn);
14583   if (GET_CODE (x) != SET)
14584     return SCHED_FUSION_NONE;
14585
14586   src = SET_SRC (x);
14587   dest = SET_DEST (x);
14588
14589   machine_mode dest_mode = GET_MODE (dest);
14590
14591   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14592     return SCHED_FUSION_NONE;
14593
14594   if (GET_CODE (src) == SIGN_EXTEND)
14595     {
14596       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14597       src = XEXP (src, 0);
14598       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14599         return SCHED_FUSION_NONE;
14600     }
14601   else if (GET_CODE (src) == ZERO_EXTEND)
14602     {
14603       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14604       src = XEXP (src, 0);
14605       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14606         return SCHED_FUSION_NONE;
14607     }
14608
14609   if (GET_CODE (src) == MEM && REG_P (dest))
14610     extract_base_offset_in_addr (src, base, offset);
14611   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14612     {
14613       fusion = SCHED_FUSION_ST;
14614       extract_base_offset_in_addr (dest, base, offset);
14615     }
14616   else
14617     return SCHED_FUSION_NONE;
14618
14619   if (*base == NULL_RTX || *offset == NULL_RTX)
14620     fusion = SCHED_FUSION_NONE;
14621
14622   return fusion;
14623 }
14624
14625 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14626
14627    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14628    and PRI are only calculated for these instructions.  For other instruction,
14629    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14630    type instruction fusion can be added by returning different priorities.
14631
14632    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14633
14634 static void
14635 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14636                                int *fusion_pri, int *pri)
14637 {
14638   int tmp, off_val;
14639   rtx base, offset;
14640   enum sched_fusion_type fusion;
14641
14642   gcc_assert (INSN_P (insn));
14643
14644   tmp = max_pri - 1;
14645   fusion = fusion_load_store (insn, &base, &offset);
14646   if (fusion == SCHED_FUSION_NONE)
14647     {
14648       *pri = tmp;
14649       *fusion_pri = tmp;
14650       return;
14651     }
14652
14653   /* Set FUSION_PRI according to fusion type and base register.  */
14654   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14655
14656   /* Calculate PRI.  */
14657   tmp /= 2;
14658
14659   /* INSN with smaller offset goes first.  */
14660   off_val = (int)(INTVAL (offset));
14661   if (off_val >= 0)
14662     tmp -= (off_val & 0xfffff);
14663   else
14664     tmp += ((- off_val) & 0xfffff);
14665
14666   *pri = tmp;
14667   return;
14668 }
14669
14670 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14671    Adjust priority of sha1h instructions so they are scheduled before
14672    other SHA1 instructions.  */
14673
14674 static int
14675 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14676 {
14677   rtx x = PATTERN (insn);
14678
14679   if (GET_CODE (x) == SET)
14680     {
14681       x = SET_SRC (x);
14682
14683       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14684         return priority + 10;
14685     }
14686
14687   return priority;
14688 }
14689
14690 /* Given OPERANDS of consecutive load/store, check if we can merge
14691    them into ldp/stp.  LOAD is true if they are load instructions.
14692    MODE is the mode of memory operands.  */
14693
14694 bool
14695 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14696                                 machine_mode mode)
14697 {
14698   HOST_WIDE_INT offval_1, offval_2, msize;
14699   enum reg_class rclass_1, rclass_2;
14700   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14701
14702   if (load)
14703     {
14704       mem_1 = operands[1];
14705       mem_2 = operands[3];
14706       reg_1 = operands[0];
14707       reg_2 = operands[2];
14708       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14709       if (REGNO (reg_1) == REGNO (reg_2))
14710         return false;
14711     }
14712   else
14713     {
14714       mem_1 = operands[0];
14715       mem_2 = operands[2];
14716       reg_1 = operands[1];
14717       reg_2 = operands[3];
14718     }
14719
14720   /* The mems cannot be volatile.  */
14721   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14722     return false;
14723
14724   /* If we have SImode and slow unaligned ldp,
14725      check the alignment to be at least 8 byte. */
14726   if (mode == SImode
14727       && (aarch64_tune_params.extra_tuning_flags
14728           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14729       && !optimize_size
14730       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14731     return false;
14732
14733   /* Check if the addresses are in the form of [base+offset].  */
14734   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14735   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14736     return false;
14737   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14738   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14739     return false;
14740
14741   /* Check if the bases are same.  */
14742   if (!rtx_equal_p (base_1, base_2))
14743     return false;
14744
14745   offval_1 = INTVAL (offset_1);
14746   offval_2 = INTVAL (offset_2);
14747   msize = GET_MODE_SIZE (mode);
14748   /* Check if the offsets are consecutive.  */
14749   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14750     return false;
14751
14752   /* Check if the addresses are clobbered by load.  */
14753   if (load)
14754     {
14755       if (reg_mentioned_p (reg_1, mem_1))
14756         return false;
14757
14758       /* In increasing order, the last load can clobber the address.  */
14759       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14760       return false;
14761     }
14762
14763   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14764     rclass_1 = FP_REGS;
14765   else
14766     rclass_1 = GENERAL_REGS;
14767
14768   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14769     rclass_2 = FP_REGS;
14770   else
14771     rclass_2 = GENERAL_REGS;
14772
14773   /* Check if the registers are of same class.  */
14774   if (rclass_1 != rclass_2)
14775     return false;
14776
14777   return true;
14778 }
14779
14780 /* Given OPERANDS of consecutive load/store, check if we can merge
14781    them into ldp/stp by adjusting the offset.  LOAD is true if they
14782    are load instructions.  MODE is the mode of memory operands.
14783
14784    Given below consecutive stores:
14785
14786      str  w1, [xb, 0x100]
14787      str  w1, [xb, 0x104]
14788      str  w1, [xb, 0x108]
14789      str  w1, [xb, 0x10c]
14790
14791    Though the offsets are out of the range supported by stp, we can
14792    still pair them after adjusting the offset, like:
14793
14794      add  scratch, xb, 0x100
14795      stp  w1, w1, [scratch]
14796      stp  w1, w1, [scratch, 0x8]
14797
14798    The peephole patterns detecting this opportunity should guarantee
14799    the scratch register is avaliable.  */
14800
14801 bool
14802 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14803                                        scalar_mode mode)
14804 {
14805   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14806   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14807   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14808   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14809
14810   if (load)
14811     {
14812       reg_1 = operands[0];
14813       mem_1 = operands[1];
14814       reg_2 = operands[2];
14815       mem_2 = operands[3];
14816       reg_3 = operands[4];
14817       mem_3 = operands[5];
14818       reg_4 = operands[6];
14819       mem_4 = operands[7];
14820       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14821                   && REG_P (reg_3) && REG_P (reg_4));
14822       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14823         return false;
14824     }
14825   else
14826     {
14827       mem_1 = operands[0];
14828       reg_1 = operands[1];
14829       mem_2 = operands[2];
14830       reg_2 = operands[3];
14831       mem_3 = operands[4];
14832       reg_3 = operands[5];
14833       mem_4 = operands[6];
14834       reg_4 = operands[7];
14835     }
14836   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14837   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14838     return false;
14839
14840   /* The mems cannot be volatile.  */
14841   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14842       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14843     return false;
14844
14845   /* Check if the addresses are in the form of [base+offset].  */
14846   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14847   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14848     return false;
14849   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14850   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14851     return false;
14852   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14853   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14854     return false;
14855   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14856   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14857     return false;
14858
14859   /* Check if the bases are same.  */
14860   if (!rtx_equal_p (base_1, base_2)
14861       || !rtx_equal_p (base_2, base_3)
14862       || !rtx_equal_p (base_3, base_4))
14863     return false;
14864
14865   offval_1 = INTVAL (offset_1);
14866   offval_2 = INTVAL (offset_2);
14867   offval_3 = INTVAL (offset_3);
14868   offval_4 = INTVAL (offset_4);
14869   msize = GET_MODE_SIZE (mode);
14870   /* Check if the offsets are consecutive.  */
14871   if ((offval_1 != (offval_2 + msize)
14872        || offval_1 != (offval_3 + msize * 2)
14873        || offval_1 != (offval_4 + msize * 3))
14874       && (offval_4 != (offval_3 + msize)
14875           || offval_4 != (offval_2 + msize * 2)
14876           || offval_4 != (offval_1 + msize * 3)))
14877     return false;
14878
14879   /* Check if the addresses are clobbered by load.  */
14880   if (load)
14881     {
14882       if (reg_mentioned_p (reg_1, mem_1)
14883           || reg_mentioned_p (reg_2, mem_2)
14884           || reg_mentioned_p (reg_3, mem_3))
14885         return false;
14886
14887       /* In increasing order, the last load can clobber the address.  */
14888       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14889         return false;
14890     }
14891
14892   /* If we have SImode and slow unaligned ldp,
14893      check the alignment to be at least 8 byte. */
14894   if (mode == SImode
14895       && (aarch64_tune_params.extra_tuning_flags
14896           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14897       && !optimize_size
14898       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14899     return false;
14900
14901   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14902     rclass_1 = FP_REGS;
14903   else
14904     rclass_1 = GENERAL_REGS;
14905
14906   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14907     rclass_2 = FP_REGS;
14908   else
14909     rclass_2 = GENERAL_REGS;
14910
14911   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14912     rclass_3 = FP_REGS;
14913   else
14914     rclass_3 = GENERAL_REGS;
14915
14916   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14917     rclass_4 = FP_REGS;
14918   else
14919     rclass_4 = GENERAL_REGS;
14920
14921   /* Check if the registers are of same class.  */
14922   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14923     return false;
14924
14925   return true;
14926 }
14927
14928 /* Given OPERANDS of consecutive load/store, this function pairs them
14929    into ldp/stp after adjusting the offset.  It depends on the fact
14930    that addresses of load/store instructions are in increasing order.
14931    MODE is the mode of memory operands.  CODE is the rtl operator
14932    which should be applied to all memory operands, it's SIGN_EXTEND,
14933    ZERO_EXTEND or UNKNOWN.  */
14934
14935 bool
14936 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14937                              scalar_mode mode, RTX_CODE code)
14938 {
14939   rtx base, offset, t1, t2;
14940   rtx mem_1, mem_2, mem_3, mem_4;
14941   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14942
14943   if (load)
14944     {
14945       mem_1 = operands[1];
14946       mem_2 = operands[3];
14947       mem_3 = operands[5];
14948       mem_4 = operands[7];
14949     }
14950   else
14951     {
14952       mem_1 = operands[0];
14953       mem_2 = operands[2];
14954       mem_3 = operands[4];
14955       mem_4 = operands[6];
14956       gcc_assert (code == UNKNOWN);
14957     }
14958
14959   extract_base_offset_in_addr (mem_1, &base, &offset);
14960   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14961
14962   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14963   msize = GET_MODE_SIZE (mode);
14964   stp_off_limit = msize * 0x40;
14965   off_val = INTVAL (offset);
14966   abs_off = (off_val < 0) ? -off_val : off_val;
14967   new_off = abs_off % stp_off_limit;
14968   adj_off = abs_off - new_off;
14969
14970   /* Further adjust to make sure all offsets are OK.  */
14971   if ((new_off + msize * 2) >= stp_off_limit)
14972     {
14973       adj_off += stp_off_limit;
14974       new_off -= stp_off_limit;
14975     }
14976
14977   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14978   if (adj_off >= 0x1000)
14979     return false;
14980
14981   if (off_val < 0)
14982     {
14983       adj_off = -adj_off;
14984       new_off = -new_off;
14985     }
14986
14987   /* Create new memory references.  */
14988   mem_1 = change_address (mem_1, VOIDmode,
14989                           plus_constant (DImode, operands[8], new_off));
14990
14991   /* Check if the adjusted address is OK for ldp/stp.  */
14992   if (!aarch64_mem_pair_operand (mem_1, mode))
14993     return false;
14994
14995   msize = GET_MODE_SIZE (mode);
14996   mem_2 = change_address (mem_2, VOIDmode,
14997                           plus_constant (DImode,
14998                                          operands[8],
14999                                          new_off + msize));
15000   mem_3 = change_address (mem_3, VOIDmode,
15001                           plus_constant (DImode,
15002                                          operands[8],
15003                                          new_off + msize * 2));
15004   mem_4 = change_address (mem_4, VOIDmode,
15005                           plus_constant (DImode,
15006                                          operands[8],
15007                                          new_off + msize * 3));
15008
15009   if (code == ZERO_EXTEND)
15010     {
15011       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
15012       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
15013       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
15014       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
15015     }
15016   else if (code == SIGN_EXTEND)
15017     {
15018       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
15019       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
15020       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
15021       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
15022     }
15023
15024   if (load)
15025     {
15026       operands[1] = mem_1;
15027       operands[3] = mem_2;
15028       operands[5] = mem_3;
15029       operands[7] = mem_4;
15030     }
15031   else
15032     {
15033       operands[0] = mem_1;
15034       operands[2] = mem_2;
15035       operands[4] = mem_3;
15036       operands[6] = mem_4;
15037     }
15038
15039   /* Emit adjusting instruction.  */
15040   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15041   /* Emit ldp/stp instructions.  */
15042   t1 = gen_rtx_SET (operands[0], operands[1]);
15043   t2 = gen_rtx_SET (operands[2], operands[3]);
15044   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15045   t1 = gen_rtx_SET (operands[4], operands[5]);
15046   t2 = gen_rtx_SET (operands[6], operands[7]);
15047   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15048   return true;
15049 }
15050
15051 /* Return 1 if pseudo register should be created and used to hold
15052    GOT address for PIC code.  */
15053
15054 bool
15055 aarch64_use_pseudo_pic_reg (void)
15056 {
15057   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15058 }
15059
15060 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15061
15062 static int
15063 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15064 {
15065   switch (XINT (x, 1))
15066     {
15067     case UNSPEC_GOTSMALLPIC:
15068     case UNSPEC_GOTSMALLPIC28K:
15069     case UNSPEC_GOTTINYPIC:
15070       return 0;
15071     default:
15072       break;
15073     }
15074
15075   return default_unspec_may_trap_p (x, flags);
15076 }
15077
15078
15079 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15080    return the log2 of that value.  Otherwise return -1.  */
15081
15082 int
15083 aarch64_fpconst_pow_of_2 (rtx x)
15084 {
15085   const REAL_VALUE_TYPE *r;
15086
15087   if (!CONST_DOUBLE_P (x))
15088     return -1;
15089
15090   r = CONST_DOUBLE_REAL_VALUE (x);
15091
15092   if (REAL_VALUE_NEGATIVE (*r)
15093       || REAL_VALUE_ISNAN (*r)
15094       || REAL_VALUE_ISINF (*r)
15095       || !real_isinteger (r, DFmode))
15096     return -1;
15097
15098   return exact_log2 (real_to_integer (r));
15099 }
15100
15101 /* If X is a vector of equal CONST_DOUBLE values and that value is
15102    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15103
15104 int
15105 aarch64_vec_fpconst_pow_of_2 (rtx x)
15106 {
15107   if (GET_CODE (x) != CONST_VECTOR)
15108     return -1;
15109
15110   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15111     return -1;
15112
15113   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15114   if (firstval <= 0)
15115     return -1;
15116
15117   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15118     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15119       return -1;
15120
15121   return firstval;
15122 }
15123
15124 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15125    to float.
15126
15127    __fp16 always promotes through this hook.
15128    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15129    through the generic excess precision logic rather than here.  */
15130
15131 static tree
15132 aarch64_promoted_type (const_tree t)
15133 {
15134   if (SCALAR_FLOAT_TYPE_P (t)
15135       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15136     return float_type_node;
15137
15138   return NULL_TREE;
15139 }
15140
15141 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15142
15143 static bool
15144 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15145                            optimization_type opt_type)
15146 {
15147   switch (op)
15148     {
15149     case rsqrt_optab:
15150       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15151
15152     default:
15153       return true;
15154     }
15155 }
15156
15157 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15158    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15159
15160 static bool
15161 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
15162 {
15163   return (mode == HFmode
15164           ? true
15165           : default_libgcc_floating_mode_supported_p (mode));
15166 }
15167
15168 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15169    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15170
15171 static bool
15172 aarch64_scalar_mode_supported_p (scalar_mode mode)
15173 {
15174   return (mode == HFmode
15175           ? true
15176           : default_scalar_mode_supported_p (mode));
15177 }
15178
15179 /* Set the value of FLT_EVAL_METHOD.
15180    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15181
15182     0: evaluate all operations and constants, whose semantic type has at
15183        most the range and precision of type float, to the range and
15184        precision of float; evaluate all other operations and constants to
15185        the range and precision of the semantic type;
15186
15187     N, where _FloatN is a supported interchange floating type
15188        evaluate all operations and constants, whose semantic type has at
15189        most the range and precision of _FloatN type, to the range and
15190        precision of the _FloatN type; evaluate all other operations and
15191        constants to the range and precision of the semantic type;
15192
15193    If we have the ARMv8.2-A extensions then we support _Float16 in native
15194    precision, so we should set this to 16.  Otherwise, we support the type,
15195    but want to evaluate expressions in float precision, so set this to
15196    0.  */
15197
15198 static enum flt_eval_method
15199 aarch64_excess_precision (enum excess_precision_type type)
15200 {
15201   switch (type)
15202     {
15203       case EXCESS_PRECISION_TYPE_FAST:
15204       case EXCESS_PRECISION_TYPE_STANDARD:
15205         /* We can calculate either in 16-bit range and precision or
15206            32-bit range and precision.  Make that decision based on whether
15207            we have native support for the ARMv8.2-A 16-bit floating-point
15208            instructions or not.  */
15209         return (TARGET_FP_F16INST
15210                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15211                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15212       case EXCESS_PRECISION_TYPE_IMPLICIT:
15213         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15214       default:
15215         gcc_unreachable ();
15216     }
15217   return FLT_EVAL_METHOD_UNPREDICTABLE;
15218 }
15219
15220 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15221    scheduled for speculative execution.  Reject the long-running division
15222    and square-root instructions.  */
15223
15224 static bool
15225 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15226 {
15227   switch (get_attr_type (insn))
15228     {
15229       case TYPE_SDIV:
15230       case TYPE_UDIV:
15231       case TYPE_FDIVS:
15232       case TYPE_FDIVD:
15233       case TYPE_FSQRTS:
15234       case TYPE_FSQRTD:
15235       case TYPE_NEON_FP_SQRT_S:
15236       case TYPE_NEON_FP_SQRT_D:
15237       case TYPE_NEON_FP_SQRT_S_Q:
15238       case TYPE_NEON_FP_SQRT_D_Q:
15239       case TYPE_NEON_FP_DIV_S:
15240       case TYPE_NEON_FP_DIV_D:
15241       case TYPE_NEON_FP_DIV_S_Q:
15242       case TYPE_NEON_FP_DIV_D_Q:
15243         return false;
15244       default:
15245         return true;
15246     }
15247 }
15248
15249 /* Target-specific selftests.  */
15250
15251 #if CHECKING_P
15252
15253 namespace selftest {
15254
15255 /* Selftest for the RTL loader.
15256    Verify that the RTL loader copes with a dump from
15257    print_rtx_function.  This is essentially just a test that class
15258    function_reader can handle a real dump, but it also verifies
15259    that lookup_reg_by_dump_name correctly handles hard regs.
15260    The presence of hard reg names in the dump means that the test is
15261    target-specific, hence it is in this file.  */
15262
15263 static void
15264 aarch64_test_loading_full_dump ()
15265 {
15266   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15267
15268   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15269
15270   rtx_insn *insn_1 = get_insn_by_uid (1);
15271   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15272
15273   rtx_insn *insn_15 = get_insn_by_uid (15);
15274   ASSERT_EQ (INSN, GET_CODE (insn_15));
15275   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15276
15277   /* Verify crtl->return_rtx.  */
15278   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15279   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15280   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15281 }
15282
15283 /* Run all target-specific selftests.  */
15284
15285 static void
15286 aarch64_run_selftests (void)
15287 {
15288   aarch64_test_loading_full_dump ();
15289 }
15290
15291 } // namespace selftest
15292
15293 #endif /* #if CHECKING_P */
15294
15295 #undef TARGET_ADDRESS_COST
15296 #define TARGET_ADDRESS_COST aarch64_address_cost
15297
15298 /* This hook will determines whether unnamed bitfields affect the alignment
15299    of the containing structure.  The hook returns true if the structure
15300    should inherit the alignment requirements of an unnamed bitfield's
15301    type.  */
15302 #undef TARGET_ALIGN_ANON_BITFIELD
15303 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15304
15305 #undef TARGET_ASM_ALIGNED_DI_OP
15306 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15307
15308 #undef TARGET_ASM_ALIGNED_HI_OP
15309 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15310
15311 #undef TARGET_ASM_ALIGNED_SI_OP
15312 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15313
15314 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15315 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15316   hook_bool_const_tree_hwi_hwi_const_tree_true
15317
15318 #undef TARGET_ASM_FILE_START
15319 #define TARGET_ASM_FILE_START aarch64_start_file
15320
15321 #undef TARGET_ASM_OUTPUT_MI_THUNK
15322 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15323
15324 #undef TARGET_ASM_SELECT_RTX_SECTION
15325 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15326
15327 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15328 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15329
15330 #undef TARGET_BUILD_BUILTIN_VA_LIST
15331 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15332
15333 #undef TARGET_CALLEE_COPIES
15334 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15335
15336 #undef TARGET_CAN_ELIMINATE
15337 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15338
15339 #undef TARGET_CAN_INLINE_P
15340 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15341
15342 #undef TARGET_CANNOT_FORCE_CONST_MEM
15343 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15344
15345 #undef TARGET_CASE_VALUES_THRESHOLD
15346 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15347
15348 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15349 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15350
15351 /* Only the least significant bit is used for initialization guard
15352    variables.  */
15353 #undef TARGET_CXX_GUARD_MASK_BIT
15354 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15355
15356 #undef TARGET_C_MODE_FOR_SUFFIX
15357 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15358
15359 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15360 #undef  TARGET_DEFAULT_TARGET_FLAGS
15361 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15362 #endif
15363
15364 #undef TARGET_CLASS_MAX_NREGS
15365 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15366
15367 #undef TARGET_BUILTIN_DECL
15368 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15369
15370 #undef TARGET_BUILTIN_RECIPROCAL
15371 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15372
15373 #undef TARGET_C_EXCESS_PRECISION
15374 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15375
15376 #undef  TARGET_EXPAND_BUILTIN
15377 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15378
15379 #undef TARGET_EXPAND_BUILTIN_VA_START
15380 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15381
15382 #undef TARGET_FOLD_BUILTIN
15383 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15384
15385 #undef TARGET_FUNCTION_ARG
15386 #define TARGET_FUNCTION_ARG aarch64_function_arg
15387
15388 #undef TARGET_FUNCTION_ARG_ADVANCE
15389 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15390
15391 #undef TARGET_FUNCTION_ARG_BOUNDARY
15392 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15393
15394 #undef TARGET_FUNCTION_ARG_PADDING
15395 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
15396
15397 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15398 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15399
15400 #undef TARGET_FUNCTION_VALUE
15401 #define TARGET_FUNCTION_VALUE aarch64_function_value
15402
15403 #undef TARGET_FUNCTION_VALUE_REGNO_P
15404 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15405
15406 #undef TARGET_FRAME_POINTER_REQUIRED
15407 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15408
15409 #undef TARGET_GIMPLE_FOLD_BUILTIN
15410 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15411
15412 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15413 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15414
15415 #undef  TARGET_INIT_BUILTINS
15416 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15417
15418 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15419 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15420   aarch64_ira_change_pseudo_allocno_class
15421
15422 #undef TARGET_LEGITIMATE_ADDRESS_P
15423 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15424
15425 #undef TARGET_LEGITIMATE_CONSTANT_P
15426 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15427
15428 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15429 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15430   aarch64_legitimize_address_displacement
15431
15432 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15433 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15434
15435 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15436 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15437 aarch64_libgcc_floating_mode_supported_p
15438
15439 #undef TARGET_MANGLE_TYPE
15440 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15441
15442 #undef TARGET_MEMORY_MOVE_COST
15443 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15444
15445 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15446 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15447
15448 #undef TARGET_MUST_PASS_IN_STACK
15449 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15450
15451 /* This target hook should return true if accesses to volatile bitfields
15452    should use the narrowest mode possible.  It should return false if these
15453    accesses should use the bitfield container type.  */
15454 #undef TARGET_NARROW_VOLATILE_BITFIELD
15455 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15456
15457 #undef  TARGET_OPTION_OVERRIDE
15458 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15459
15460 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15461 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15462   aarch64_override_options_after_change
15463
15464 #undef TARGET_OPTION_SAVE
15465 #define TARGET_OPTION_SAVE aarch64_option_save
15466
15467 #undef TARGET_OPTION_RESTORE
15468 #define TARGET_OPTION_RESTORE aarch64_option_restore
15469
15470 #undef TARGET_OPTION_PRINT
15471 #define TARGET_OPTION_PRINT aarch64_option_print
15472
15473 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15474 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15475
15476 #undef TARGET_SET_CURRENT_FUNCTION
15477 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15478
15479 #undef TARGET_PASS_BY_REFERENCE
15480 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15481
15482 #undef TARGET_PREFERRED_RELOAD_CLASS
15483 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15484
15485 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15486 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15487
15488 #undef TARGET_PROMOTED_TYPE
15489 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15490
15491 #undef TARGET_SECONDARY_RELOAD
15492 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15493
15494 #undef TARGET_SHIFT_TRUNCATION_MASK
15495 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15496
15497 #undef TARGET_SETUP_INCOMING_VARARGS
15498 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15499
15500 #undef TARGET_STRUCT_VALUE_RTX
15501 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15502
15503 #undef TARGET_REGISTER_MOVE_COST
15504 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15505
15506 #undef TARGET_RETURN_IN_MEMORY
15507 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15508
15509 #undef TARGET_RETURN_IN_MSB
15510 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15511
15512 #undef TARGET_RTX_COSTS
15513 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15514
15515 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15516 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15517
15518 #undef TARGET_SCHED_ISSUE_RATE
15519 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15520
15521 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15522 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15523   aarch64_sched_first_cycle_multipass_dfa_lookahead
15524
15525 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15526 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15527   aarch64_first_cycle_multipass_dfa_lookahead_guard
15528
15529 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15530 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15531   aarch64_get_separate_components
15532
15533 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15534 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15535   aarch64_components_for_bb
15536
15537 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15538 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15539   aarch64_disqualify_components
15540
15541 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15542 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15543   aarch64_emit_prologue_components
15544
15545 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15546 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15547   aarch64_emit_epilogue_components
15548
15549 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15550 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15551   aarch64_set_handled_components
15552
15553 #undef TARGET_TRAMPOLINE_INIT
15554 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15555
15556 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15557 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15558
15559 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15560 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15561
15562 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15563 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15564   aarch64_builtin_support_vector_misalignment
15565
15566 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15567 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15568
15569 #undef TARGET_VECTORIZE_ADD_STMT_COST
15570 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15571
15572 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15573 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15574   aarch64_builtin_vectorization_cost
15575
15576 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15577 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15578
15579 #undef TARGET_VECTORIZE_BUILTINS
15580 #define TARGET_VECTORIZE_BUILTINS
15581
15582 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15583 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15584   aarch64_builtin_vectorized_function
15585
15586 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15587 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15588   aarch64_autovectorize_vector_sizes
15589
15590 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15591 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15592   aarch64_atomic_assign_expand_fenv
15593
15594 /* Section anchor support.  */
15595
15596 #undef TARGET_MIN_ANCHOR_OFFSET
15597 #define TARGET_MIN_ANCHOR_OFFSET -256
15598
15599 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15600    byte offset; we can do much more for larger data types, but have no way
15601    to determine the size of the access.  We assume accesses are aligned.  */
15602 #undef TARGET_MAX_ANCHOR_OFFSET
15603 #define TARGET_MAX_ANCHOR_OFFSET 4095
15604
15605 #undef TARGET_VECTOR_ALIGNMENT
15606 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15607
15608 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15609 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15610   aarch64_simd_vector_alignment_reachable
15611
15612 /* vec_perm support.  */
15613
15614 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15615 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15616   aarch64_vectorize_vec_perm_const_ok
15617
15618 #undef TARGET_INIT_LIBFUNCS
15619 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15620
15621 #undef TARGET_FIXED_CONDITION_CODE_REGS
15622 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15623
15624 #undef TARGET_FLAGS_REGNUM
15625 #define TARGET_FLAGS_REGNUM CC_REGNUM
15626
15627 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15628 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15629
15630 #undef TARGET_ASAN_SHADOW_OFFSET
15631 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15632
15633 #undef TARGET_LEGITIMIZE_ADDRESS
15634 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15635
15636 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15637 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15638   aarch64_use_by_pieces_infrastructure_p
15639
15640 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15641 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15642
15643 #undef TARGET_CAN_USE_DOLOOP_P
15644 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15645
15646 #undef TARGET_SCHED_ADJUST_PRIORITY
15647 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15648
15649 #undef TARGET_SCHED_MACRO_FUSION_P
15650 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15651
15652 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15653 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15654
15655 #undef TARGET_SCHED_FUSION_PRIORITY
15656 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15657
15658 #undef TARGET_UNSPEC_MAY_TRAP_P
15659 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15660
15661 #undef TARGET_USE_PSEUDO_PIC_REG
15662 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15663
15664 #undef TARGET_PRINT_OPERAND
15665 #define TARGET_PRINT_OPERAND aarch64_print_operand
15666
15667 #undef TARGET_PRINT_OPERAND_ADDRESS
15668 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15669
15670 #undef TARGET_OPTAB_SUPPORTED_P
15671 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15672
15673 #undef TARGET_OMIT_STRUCT_RETURN_REG
15674 #define TARGET_OMIT_STRUCT_RETURN_REG true
15675
15676 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15677 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15678 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15679
15680 #undef TARGET_HARD_REGNO_NREGS
15681 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
15682 #undef TARGET_HARD_REGNO_MODE_OK
15683 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
15684
15685 #undef TARGET_MODES_TIEABLE_P
15686 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
15687
15688 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
15689 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
15690   aarch64_hard_regno_call_part_clobbered
15691
15692 #undef TARGET_CONSTANT_ALIGNMENT
15693 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
15694
15695 #if CHECKING_P
15696 #undef TARGET_RUN_TARGET_SELFTESTS
15697 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15698 #endif /* #if CHECKING_P */
15699
15700 struct gcc_target targetm = TARGET_INITIALIZER;
15701
15702 #include "gt-aarch64.h"