gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2017 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "attribs.h"
  37 #include "optabs.h"
  38 #include "regs.h"
  39 #include "emit-rtl.h"
  40 #include "recog.h"
  41 #include "diagnostic.h"
  42 #include "insn-attr.h"
  43 #include "alias.h"
  44 #include "fold-const.h"
  45 #include "stor-layout.h"
  46 #include "calls.h"
  47 #include "varasm.h"
  48 #include "output.h"
  49 #include "flags.h"
  50 #include "explow.h"
  51 #include "expr.h"
  52 #include "reload.h"
  53 #include "langhooks.h"
  54 #include "opts.h"
  55 #include "params.h"
  56 #include "gimplify.h"
  57 #include "dwarf2.h"
  58 #include "gimple-iterator.h"
  59 #include "tree-vectorizer.h"
  60 #include "aarch64-cost-tables.h"
  61 #include "dumpfile.h"
  62 #include "builtins.h"
  63 #include "rtl-iter.h"
  64 #include "tm-constrs.h"
  65 #include "sched-int.h"
  66 #include "target-globals.h"
  67 #include "common/common-target.h"
  68 #include "selftest.h"
  69 #include "selftest-rtl.h"
  70
  71 /* This file should be included last.  */
  72 #include "target-def.h"
  73
  74 /* Defined for convenience.  */
  75 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  76
  77 /* Classifies an address.
  78
  79    ADDRESS_REG_IMM
  80        A simple base register plus immediate offset.
  81
  82    ADDRESS_REG_WB
  83        A base register indexed by immediate offset with writeback.
  84
  85    ADDRESS_REG_REG
  86        A base register indexed by (optionally scaled) register.
  87
  88    ADDRESS_REG_UXTW
  89        A base register indexed by (optionally scaled) zero-extended register.
  90
  91    ADDRESS_REG_SXTW
  92        A base register indexed by (optionally scaled) sign-extended register.
  93
  94    ADDRESS_LO_SUM
  95        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  96
  97    ADDRESS_SYMBOLIC:
  98        A constant symbolic address, in pc-relative literal pool.  */
  99
 100 enum aarch64_address_type {
 101   ADDRESS_REG_IMM,
 102   ADDRESS_REG_WB,
 103   ADDRESS_REG_REG,
 104   ADDRESS_REG_UXTW,
 105   ADDRESS_REG_SXTW,
 106   ADDRESS_LO_SUM,
 107   ADDRESS_SYMBOLIC
 108 };
 109
 110 struct aarch64_address_info {
 111   enum aarch64_address_type type;
 112   rtx base;
 113   rtx offset;
 114   int shift;
 115   enum aarch64_symbol_type symbol_type;
 116 };
 117
 118 struct simd_immediate_info
 119 {
 120   rtx value;
 121   int shift;
 122   int element_width;
 123   bool mvn;
 124   bool msl;
 125 };
 126
 127 /* The current code model.  */
 128 enum aarch64_code_model aarch64_cmodel;
 129
 130 #ifdef HAVE_AS_TLS
 131 #undef TARGET_HAVE_TLS
 132 #define TARGET_HAVE_TLS 1
 133 #endif
 134
 135 static bool aarch64_composite_type_p (const_tree, machine_mode);
 136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 137                                                      const_tree,
 138                                                      machine_mode *, int *,
 139                                                      bool *);
 140 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 141 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 142 static void aarch64_override_options_after_change (void);
 143 static bool aarch64_vector_mode_supported_p (machine_mode);
 144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 145                                                  const unsigned char *sel);
 146 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
 148                                                          const_tree type,
 149                                                          int misalignment,
 150                                                          bool is_packed);
 151 static machine_mode
 152 aarch64_simd_container_mode (machine_mode mode, unsigned width);
 153
 154 /* Major revision number of the ARM Architecture implemented by the target.  */
 155 unsigned aarch64_architecture_version;
 156
 157 /* The processor for which instructions should be scheduled.  */
 158 enum aarch64_processor aarch64_tune = cortexa53;
 159
 160 /* Mask to specify which instruction scheduling options should be used.  */
 161 unsigned long aarch64_tune_flags = 0;
 162
 163 /* Global flag for PC relative loads.  */
 164 bool aarch64_pcrelative_literal_loads;
 165
 166 /* Support for command line parsing of boolean flags in the tuning
 167    structures.  */
 168 struct aarch64_flag_desc
 169 {
 170   const char* name;
 171   unsigned int flag;
 172 };
 173
 174 #define AARCH64_FUSION_PAIR(name, internal_name) \
 175   { name, AARCH64_FUSE_##internal_name },
 176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 177 {
 178   { "none", AARCH64_FUSE_NOTHING },
 179 #include "aarch64-fusion-pairs.def"
 180   { "all", AARCH64_FUSE_ALL },
 181   { NULL, AARCH64_FUSE_NOTHING }
 182 };
 183
 184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 185   { name, AARCH64_EXTRA_TUNE_##internal_name },
 186 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 187 {
 188   { "none", AARCH64_EXTRA_TUNE_NONE },
 189 #include "aarch64-tuning-flags.def"
 190   { "all", AARCH64_EXTRA_TUNE_ALL },
 191   { NULL, AARCH64_EXTRA_TUNE_NONE }
 192 };
 193
 194 /* Tuning parameters.  */
 195
 196 static const struct cpu_addrcost_table generic_addrcost_table =
 197 {
 198     {
 199       1, /* hi  */
 200       0, /* si  */
 201       0, /* di  */
 202       1, /* ti  */
 203     },
 204   0, /* pre_modify  */
 205   0, /* post_modify  */
 206   0, /* register_offset  */
 207   0, /* register_sextend  */
 208   0, /* register_zextend  */
 209   0 /* imm_offset  */
 210 };
 211
 212 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 213 {
 214     {
 215       0, /* hi  */
 216       0, /* si  */
 217       0, /* di  */
 218       2, /* ti  */
 219     },
 220   0, /* pre_modify  */
 221   0, /* post_modify  */
 222   1, /* register_offset  */
 223   1, /* register_sextend  */
 224   2, /* register_zextend  */
 225   0, /* imm_offset  */
 226 };
 227
 228 static const struct cpu_addrcost_table xgene1_addrcost_table =
 229 {
 230     {
 231       1, /* hi  */
 232       0, /* si  */
 233       0, /* di  */
 234       1, /* ti  */
 235     },
 236   1, /* pre_modify  */
 237   0, /* post_modify  */
 238   0, /* register_offset  */
 239   1, /* register_sextend  */
 240   1, /* register_zextend  */
 241   0, /* imm_offset  */
 242 };
 243
 244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
 245 {
 246     {
 247       1, /* hi  */
 248       1, /* si  */
 249       1, /* di  */
 250       2, /* ti  */
 251     },
 252   0, /* pre_modify  */
 253   0, /* post_modify  */
 254   2, /* register_offset  */
 255   3, /* register_sextend  */
 256   3, /* register_zextend  */
 257   0, /* imm_offset  */
 258 };
 259
 260 static const struct cpu_regmove_cost generic_regmove_cost =
 261 {
 262   1, /* GP2GP  */
 263   /* Avoid the use of slow int<->fp moves for spilling by setting
 264      their cost higher than memmov_cost.  */
 265   5, /* GP2FP  */
 266   5, /* FP2GP  */
 267   2 /* FP2FP  */
 268 };
 269
 270 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 271 {
 272   1, /* GP2GP  */
 273   /* Avoid the use of slow int<->fp moves for spilling by setting
 274      their cost higher than memmov_cost.  */
 275   5, /* GP2FP  */
 276   5, /* FP2GP  */
 277   2 /* FP2FP  */
 278 };
 279
 280 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 281 {
 282   1, /* GP2GP  */
 283   /* Avoid the use of slow int<->fp moves for spilling by setting
 284      their cost higher than memmov_cost.  */
 285   5, /* GP2FP  */
 286   5, /* FP2GP  */
 287   2 /* FP2FP  */
 288 };
 289
 290 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 291 {
 292   1, /* GP2GP  */
 293   /* Avoid the use of slow int<->fp moves for spilling by setting
 294      their cost higher than memmov_cost (actual, 4 and 9).  */
 295   9, /* GP2FP  */
 296   9, /* FP2GP  */
 297   1 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost thunderx_regmove_cost =
 301 {
 302   2, /* GP2GP  */
 303   2, /* GP2FP  */
 304   6, /* FP2GP  */
 305   4 /* FP2FP  */
 306 };
 307
 308 static const struct cpu_regmove_cost xgene1_regmove_cost =
 309 {
 310   1, /* GP2GP  */
 311   /* Avoid the use of slow int<->fp moves for spilling by setting
 312      their cost higher than memmov_cost.  */
 313   8, /* GP2FP  */
 314   8, /* FP2GP  */
 315   2 /* FP2FP  */
 316 };
 317
 318 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 319 {
 320   2, /* GP2GP  */
 321   /* Avoid the use of int<->fp moves for spilling.  */
 322   6, /* GP2FP  */
 323   6, /* FP2GP  */
 324   4 /* FP2FP  */
 325 };
 326
 327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
 328 {
 329   1, /* GP2GP  */
 330   /* Avoid the use of int<->fp moves for spilling.  */
 331   8, /* GP2FP  */
 332   8, /* FP2GP  */
 333   4  /* FP2FP  */
 334 };
 335
 336 /* Generic costs for vector insn classes.  */
 337 static const struct cpu_vector_cost generic_vector_cost =
 338 {
 339   1, /* scalar_int_stmt_cost  */
 340   1, /* scalar_fp_stmt_cost  */
 341   1, /* scalar_load_cost  */
 342   1, /* scalar_store_cost  */
 343   1, /* vec_int_stmt_cost  */
 344   1, /* vec_fp_stmt_cost  */
 345   2, /* vec_permute_cost  */
 346   1, /* vec_to_scalar_cost  */
 347   1, /* scalar_to_vec_cost  */
 348   1, /* vec_align_load_cost  */
 349   1, /* vec_unalign_load_cost  */
 350   1, /* vec_unalign_store_cost  */
 351   1, /* vec_store_cost  */
 352   3, /* cond_taken_branch_cost  */
 353   1 /* cond_not_taken_branch_cost  */
 354 };
 355
 356 /* ThunderX costs for vector insn classes.  */
 357 static const struct cpu_vector_cost thunderx_vector_cost =
 358 {
 359   1, /* scalar_int_stmt_cost  */
 360   1, /* scalar_fp_stmt_cost  */
 361   3, /* scalar_load_cost  */
 362   1, /* scalar_store_cost  */
 363   4, /* vec_int_stmt_cost  */
 364   1, /* vec_fp_stmt_cost  */
 365   4, /* vec_permute_cost  */
 366   2, /* vec_to_scalar_cost  */
 367   2, /* scalar_to_vec_cost  */
 368   3, /* vec_align_load_cost  */
 369   5, /* vec_unalign_load_cost  */
 370   5, /* vec_unalign_store_cost  */
 371   1, /* vec_store_cost  */
 372   3, /* cond_taken_branch_cost  */
 373   3 /* cond_not_taken_branch_cost  */
 374 };
 375
 376 /* Generic costs for vector insn classes.  */
 377 static const struct cpu_vector_cost cortexa57_vector_cost =
 378 {
 379   1, /* scalar_int_stmt_cost  */
 380   1, /* scalar_fp_stmt_cost  */
 381   4, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   2, /* vec_int_stmt_cost  */
 384   2, /* vec_fp_stmt_cost  */
 385   3, /* vec_permute_cost  */
 386   8, /* vec_to_scalar_cost  */
 387   8, /* scalar_to_vec_cost  */
 388   4, /* vec_align_load_cost  */
 389   4, /* vec_unalign_load_cost  */
 390   1, /* vec_unalign_store_cost  */
 391   1, /* vec_store_cost  */
 392   1, /* cond_taken_branch_cost  */
 393   1 /* cond_not_taken_branch_cost  */
 394 };
 395
 396 static const struct cpu_vector_cost exynosm1_vector_cost =
 397 {
 398   1, /* scalar_int_stmt_cost  */
 399   1, /* scalar_fp_stmt_cost  */
 400   5, /* scalar_load_cost  */
 401   1, /* scalar_store_cost  */
 402   3, /* vec_int_stmt_cost  */
 403   3, /* vec_fp_stmt_cost  */
 404   3, /* vec_permute_cost  */
 405   3, /* vec_to_scalar_cost  */
 406   3, /* scalar_to_vec_cost  */
 407   5, /* vec_align_load_cost  */
 408   5, /* vec_unalign_load_cost  */
 409   1, /* vec_unalign_store_cost  */
 410   1, /* vec_store_cost  */
 411   1, /* cond_taken_branch_cost  */
 412   1 /* cond_not_taken_branch_cost  */
 413 };
 414
 415 /* Generic costs for vector insn classes.  */
 416 static const struct cpu_vector_cost xgene1_vector_cost =
 417 {
 418   1, /* scalar_int_stmt_cost  */
 419   1, /* scalar_fp_stmt_cost  */
 420   5, /* scalar_load_cost  */
 421   1, /* scalar_store_cost  */
 422   2, /* vec_int_stmt_cost  */
 423   2, /* vec_fp_stmt_cost  */
 424   2, /* vec_permute_cost  */
 425   4, /* vec_to_scalar_cost  */
 426   4, /* scalar_to_vec_cost  */
 427   10, /* vec_align_load_cost  */
 428   10, /* vec_unalign_load_cost  */
 429   2, /* vec_unalign_store_cost  */
 430   2, /* vec_store_cost  */
 431   2, /* cond_taken_branch_cost  */
 432   1 /* cond_not_taken_branch_cost  */
 433 };
 434
 435 /* Costs for vector insn classes for Vulcan.  */
 436 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 437 {
 438   1, /* scalar_int_stmt_cost  */
 439   6, /* scalar_fp_stmt_cost  */
 440   4, /* scalar_load_cost  */
 441   1, /* scalar_store_cost  */
 442   5, /* vec_int_stmt_cost  */
 443   6, /* vec_fp_stmt_cost  */
 444   3, /* vec_permute_cost  */
 445   6, /* vec_to_scalar_cost  */
 446   5, /* scalar_to_vec_cost  */
 447   8, /* vec_align_load_cost  */
 448   8, /* vec_unalign_load_cost  */
 449   4, /* vec_unalign_store_cost  */
 450   4, /* vec_store_cost  */
 451   2, /* cond_taken_branch_cost  */
 452   1  /* cond_not_taken_branch_cost  */
 453 };
 454
 455 /* Generic costs for branch instructions.  */
 456 static const struct cpu_branch_cost generic_branch_cost =
 457 {
 458   1,  /* Predictable.  */
 459   3   /* Unpredictable.  */
 460 };
 461
 462 /* Generic approximation modes.  */
 463 static const cpu_approx_modes generic_approx_modes =
 464 {
 465   AARCH64_APPROX_NONE,  /* division  */
 466   AARCH64_APPROX_NONE,  /* sqrt  */
 467   AARCH64_APPROX_NONE   /* recip_sqrt  */
 468 };
 469
 470 /* Approximation modes for Exynos M1.  */
 471 static const cpu_approx_modes exynosm1_approx_modes =
 472 {
 473   AARCH64_APPROX_NONE,  /* division  */
 474   AARCH64_APPROX_ALL,   /* sqrt  */
 475   AARCH64_APPROX_ALL    /* recip_sqrt  */
 476 };
 477
 478 /* Approximation modes for X-Gene 1.  */
 479 static const cpu_approx_modes xgene1_approx_modes =
 480 {
 481   AARCH64_APPROX_NONE,  /* division  */
 482   AARCH64_APPROX_NONE,  /* sqrt  */
 483   AARCH64_APPROX_ALL    /* recip_sqrt  */
 484 };
 485
 486 /* Generic prefetch settings (which disable prefetch).  */
 487 static const cpu_prefetch_tune generic_prefetch_tune =
 488 {
 489   0,                    /* num_slots  */
 490   -1,                   /* l1_cache_size  */
 491   -1,                   /* l1_cache_line_size  */
 492   -1,                   /* l2_cache_size  */
 493   -1                    /* default_opt_level  */
 494 };
 495
 496 static const cpu_prefetch_tune exynosm1_prefetch_tune =
 497 {
 498   0,                    /* num_slots  */
 499   -1,                   /* l1_cache_size  */
 500   64,                   /* l1_cache_line_size  */
 501   -1,                   /* l2_cache_size  */
 502   -1                    /* default_opt_level  */
 503 };
 504
 505 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
 506 {
 507   4,                    /* num_slots  */
 508   32,                   /* l1_cache_size  */
 509   64,                   /* l1_cache_line_size  */
 510   1024,                 /* l2_cache_size  */
 511   3                     /* default_opt_level  */
 512 };
 513
 514 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
 515 {
 516   8,                    /* num_slots  */
 517   32,                   /* l1_cache_size  */
 518   128,                  /* l1_cache_line_size  */
 519   16*1024,              /* l2_cache_size  */
 520   3                     /* default_opt_level  */
 521 };
 522
 523 static const cpu_prefetch_tune thunderx_prefetch_tune =
 524 {
 525   8,                    /* num_slots  */
 526   32,                   /* l1_cache_size  */
 527   128,                  /* l1_cache_line_size  */
 528   -1,                   /* l2_cache_size  */
 529   -1                    /* default_opt_level  */
 530 };
 531
 532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
 533 {
 534   8,                    /* num_slots  */
 535   32,                   /* l1_cache_size  */
 536   64,                   /* l1_cache_line_size  */
 537   256,                  /* l2_cache_size  */
 538   -1                    /* default_opt_level  */
 539 };
 540
 541 static const struct tune_params generic_tunings =
 542 {
 543   &cortexa57_extra_costs,
 544   &generic_addrcost_table,
 545   &generic_regmove_cost,
 546   &generic_vector_cost,
 547   &generic_branch_cost,
 548   &generic_approx_modes,
 549   4, /* memmov_cost  */
 550   2, /* issue_rate  */
 551   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 552   8,    /* function_align.  */
 553   4,    /* jump_align.  */
 554   8,    /* loop_align.  */
 555   2,    /* int_reassoc_width.  */
 556   4,    /* fp_reassoc_width.  */
 557   1,    /* vec_reassoc_width.  */
 558   2,    /* min_div_recip_mul_sf.  */
 559   2,    /* min_div_recip_mul_df.  */
 560   0,    /* max_case_values.  */
 561   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 562   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 563   &generic_prefetch_tune
 564 };
 565
 566 static const struct tune_params cortexa35_tunings =
 567 {
 568   &cortexa53_extra_costs,
 569   &generic_addrcost_table,
 570   &cortexa53_regmove_cost,
 571   &generic_vector_cost,
 572   &generic_branch_cost,
 573   &generic_approx_modes,
 574   4, /* memmov_cost  */
 575   1, /* issue_rate  */
 576   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 577    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 578   16,   /* function_align.  */
 579   4,    /* jump_align.  */
 580   8,    /* loop_align.  */
 581   2,    /* int_reassoc_width.  */
 582   4,    /* fp_reassoc_width.  */
 583   1,    /* vec_reassoc_width.  */
 584   2,    /* min_div_recip_mul_sf.  */
 585   2,    /* min_div_recip_mul_df.  */
 586   0,    /* max_case_values.  */
 587   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 588   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 589   &generic_prefetch_tune
 590 };
 591
 592 static const struct tune_params cortexa53_tunings =
 593 {
 594   &cortexa53_extra_costs,
 595   &generic_addrcost_table,
 596   &cortexa53_regmove_cost,
 597   &generic_vector_cost,
 598   &generic_branch_cost,
 599   &generic_approx_modes,
 600   4, /* memmov_cost  */
 601   2, /* issue_rate  */
 602   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 603    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 604   16,   /* function_align.  */
 605   4,    /* jump_align.  */
 606   8,    /* loop_align.  */
 607   2,    /* int_reassoc_width.  */
 608   4,    /* fp_reassoc_width.  */
 609   1,    /* vec_reassoc_width.  */
 610   2,    /* min_div_recip_mul_sf.  */
 611   2,    /* min_div_recip_mul_df.  */
 612   0,    /* max_case_values.  */
 613   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 614   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 615   &generic_prefetch_tune
 616 };
 617
 618 static const struct tune_params cortexa57_tunings =
 619 {
 620   &cortexa57_extra_costs,
 621   &generic_addrcost_table,
 622   &cortexa57_regmove_cost,
 623   &cortexa57_vector_cost,
 624   &generic_branch_cost,
 625   &generic_approx_modes,
 626   4, /* memmov_cost  */
 627   3, /* issue_rate  */
 628   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 629    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 630   16,   /* function_align.  */
 631   4,    /* jump_align.  */
 632   8,    /* loop_align.  */
 633   2,    /* int_reassoc_width.  */
 634   4,    /* fp_reassoc_width.  */
 635   1,    /* vec_reassoc_width.  */
 636   2,    /* min_div_recip_mul_sf.  */
 637   2,    /* min_div_recip_mul_df.  */
 638   0,    /* max_case_values.  */
 639   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 640   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags.  */
 641   &generic_prefetch_tune
 642 };
 643
 644 static const struct tune_params cortexa72_tunings =
 645 {
 646   &cortexa57_extra_costs,
 647   &generic_addrcost_table,
 648   &cortexa57_regmove_cost,
 649   &cortexa57_vector_cost,
 650   &generic_branch_cost,
 651   &generic_approx_modes,
 652   4, /* memmov_cost  */
 653   3, /* issue_rate  */
 654   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 655    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 656   16,   /* function_align.  */
 657   4,    /* jump_align.  */
 658   8,    /* loop_align.  */
 659   2,    /* int_reassoc_width.  */
 660   4,    /* fp_reassoc_width.  */
 661   1,    /* vec_reassoc_width.  */
 662   2,    /* min_div_recip_mul_sf.  */
 663   2,    /* min_div_recip_mul_df.  */
 664   0,    /* max_case_values.  */
 665   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 666   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 667   &generic_prefetch_tune
 668 };
 669
 670 static const struct tune_params cortexa73_tunings =
 671 {
 672   &cortexa57_extra_costs,
 673   &generic_addrcost_table,
 674   &cortexa57_regmove_cost,
 675   &cortexa57_vector_cost,
 676   &generic_branch_cost,
 677   &generic_approx_modes,
 678   4, /* memmov_cost.  */
 679   2, /* issue_rate.  */
 680   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 681    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 682   16,   /* function_align.  */
 683   4,    /* jump_align.  */
 684   8,    /* loop_align.  */
 685   2,    /* int_reassoc_width.  */
 686   4,    /* fp_reassoc_width.  */
 687   1,    /* vec_reassoc_width.  */
 688   2,    /* min_div_recip_mul_sf.  */
 689   2,    /* min_div_recip_mul_df.  */
 690   0,    /* max_case_values.  */
 691   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 692   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 693   &generic_prefetch_tune
 694 };
 695
 696
 697
 698 static const struct tune_params exynosm1_tunings =
 699 {
 700   &exynosm1_extra_costs,
 701   &exynosm1_addrcost_table,
 702   &exynosm1_regmove_cost,
 703   &exynosm1_vector_cost,
 704   &generic_branch_cost,
 705   &exynosm1_approx_modes,
 706   4,    /* memmov_cost  */
 707   3,    /* issue_rate  */
 708   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 709   4,    /* function_align.  */
 710   4,    /* jump_align.  */
 711   4,    /* loop_align.  */
 712   2,    /* int_reassoc_width.  */
 713   4,    /* fp_reassoc_width.  */
 714   1,    /* vec_reassoc_width.  */
 715   2,    /* min_div_recip_mul_sf.  */
 716   2,    /* min_div_recip_mul_df.  */
 717   48,   /* max_case_values.  */
 718   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 719   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
 720   &exynosm1_prefetch_tune
 721 };
 722
 723 static const struct tune_params thunderxt88_tunings =
 724 {
 725   &thunderx_extra_costs,
 726   &generic_addrcost_table,
 727   &thunderx_regmove_cost,
 728   &thunderx_vector_cost,
 729   &generic_branch_cost,
 730   &generic_approx_modes,
 731   6, /* memmov_cost  */
 732   2, /* issue_rate  */
 733   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 734   8,    /* function_align.  */
 735   8,    /* jump_align.  */
 736   8,    /* loop_align.  */
 737   2,    /* int_reassoc_width.  */
 738   4,    /* fp_reassoc_width.  */
 739   1,    /* vec_reassoc_width.  */
 740   2,    /* min_div_recip_mul_sf.  */
 741   2,    /* min_div_recip_mul_df.  */
 742   0,    /* max_case_values.  */
 743   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 744   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),     /* tune_flags.  */
 745   &thunderxt88_prefetch_tune
 746 };
 747
 748 static const struct tune_params thunderx_tunings =
 749 {
 750   &thunderx_extra_costs,
 751   &generic_addrcost_table,
 752   &thunderx_regmove_cost,
 753   &thunderx_vector_cost,
 754   &generic_branch_cost,
 755   &generic_approx_modes,
 756   6, /* memmov_cost  */
 757   2, /* issue_rate  */
 758   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 759   8,    /* function_align.  */
 760   8,    /* jump_align.  */
 761   8,    /* loop_align.  */
 762   2,    /* int_reassoc_width.  */
 763   4,    /* fp_reassoc_width.  */
 764   1,    /* vec_reassoc_width.  */
 765   2,    /* min_div_recip_mul_sf.  */
 766   2,    /* min_div_recip_mul_df.  */
 767   0,    /* max_case_values.  */
 768   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 769   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
 770    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),    /* tune_flags.  */
 771   &thunderx_prefetch_tune
 772 };
 773
 774 static const struct tune_params xgene1_tunings =
 775 {
 776   &xgene1_extra_costs,
 777   &xgene1_addrcost_table,
 778   &xgene1_regmove_cost,
 779   &xgene1_vector_cost,
 780   &generic_branch_cost,
 781   &xgene1_approx_modes,
 782   6, /* memmov_cost  */
 783   4, /* issue_rate  */
 784   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 785   16,   /* function_align.  */
 786   8,    /* jump_align.  */
 787   16,   /* loop_align.  */
 788   2,    /* int_reassoc_width.  */
 789   4,    /* fp_reassoc_width.  */
 790   1,    /* vec_reassoc_width.  */
 791   2,    /* min_div_recip_mul_sf.  */
 792   2,    /* min_div_recip_mul_df.  */
 793   0,    /* max_case_values.  */
 794   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 795   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 796   &generic_prefetch_tune
 797 };
 798
 799 static const struct tune_params qdf24xx_tunings =
 800 {
 801   &qdf24xx_extra_costs,
 802   &generic_addrcost_table,
 803   &qdf24xx_regmove_cost,
 804   &generic_vector_cost,
 805   &generic_branch_cost,
 806   &generic_approx_modes,
 807   4, /* memmov_cost  */
 808   4, /* issue_rate  */
 809   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 810    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 811   16,   /* function_align.  */
 812   8,    /* jump_align.  */
 813   16,   /* loop_align.  */
 814   2,    /* int_reassoc_width.  */
 815   4,    /* fp_reassoc_width.  */
 816   1,    /* vec_reassoc_width.  */
 817   2,    /* min_div_recip_mul_sf.  */
 818   2,    /* min_div_recip_mul_df.  */
 819   0,    /* max_case_values.  */
 820   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 821   (AARCH64_EXTRA_TUNE_NONE),            /* tune_flags.  */
 822   &qdf24xx_prefetch_tune
 823 };
 824
 825 static const struct tune_params thunderx2t99_tunings =
 826 {
 827   &thunderx2t99_extra_costs,
 828   &thunderx2t99_addrcost_table,
 829   &thunderx2t99_regmove_cost,
 830   &thunderx2t99_vector_cost,
 831   &generic_branch_cost,
 832   &generic_approx_modes,
 833   4, /* memmov_cost.  */
 834   4, /* issue_rate.  */
 835   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
 836    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
 837   16,   /* function_align.  */
 838   8,    /* jump_align.  */
 839   16,   /* loop_align.  */
 840   3,    /* int_reassoc_width.  */
 841   2,    /* fp_reassoc_width.  */
 842   2,    /* vec_reassoc_width.  */
 843   2,    /* min_div_recip_mul_sf.  */
 844   2,    /* min_div_recip_mul_df.  */
 845   0,    /* max_case_values.  */
 846   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 847   (AARCH64_EXTRA_TUNE_NONE),    /* tune_flags.  */
 848   &thunderx2t99_prefetch_tune
 849 };
 850
 851 /* Support for fine-grained override of the tuning structures.  */
 852 struct aarch64_tuning_override_function
 853 {
 854   const char* name;
 855   void (*parse_override)(const char*, struct tune_params*);
 856 };
 857
 858 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 859 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 860
 861 static const struct aarch64_tuning_override_function
 862 aarch64_tuning_override_functions[] =
 863 {
 864   { "fuse", aarch64_parse_fuse_string },
 865   { "tune", aarch64_parse_tune_string },
 866   { NULL, NULL }
 867 };
 868
 869 /* A processor implementing AArch64.  */
 870 struct processor
 871 {
 872   const char *const name;
 873   enum aarch64_processor ident;
 874   enum aarch64_processor sched_core;
 875   enum aarch64_arch arch;
 876   unsigned architecture_version;
 877   const unsigned long flags;
 878   const struct tune_params *const tune;
 879 };
 880
 881 /* Architectures implementing AArch64.  */
 882 static const struct processor all_architectures[] =
 883 {
 884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 885   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 886 #include "aarch64-arches.def"
 887   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 888 };
 889
 890 /* Processor cores implementing AArch64.  */
 891 static const struct processor all_cores[] =
 892 {
 893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
 894   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 895   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 896   FLAGS, &COSTS##_tunings},
 897 #include "aarch64-cores.def"
 898   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 899     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 900   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 901 };
 902
 903
 904 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 905    handling code or by target attributes.  */
 906 static const struct processor *selected_arch;
 907 static const struct processor *selected_cpu;
 908 static const struct processor *selected_tune;
 909
 910 /* The current tuning set.  */
 911 struct tune_params aarch64_tune_params = generic_tunings;
 912
 913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 914
 915 /* An ISA extension in the co-processor and main instruction set space.  */
 916 struct aarch64_option_extension
 917 {
 918   const char *const name;
 919   const unsigned long flags_on;
 920   const unsigned long flags_off;
 921 };
 922
 923 typedef enum aarch64_cond_code
 924 {
 925   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 926   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 927   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 928 }
 929 aarch64_cc;
 930
 931 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 932
 933 /* The condition codes of the processor, and the inverse function.  */
 934 static const char * const aarch64_condition_codes[] =
 935 {
 936   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 937   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 938 };
 939
 940 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 941 const char *
 942 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 943                         const char * branch_format)
 944 {
 945     rtx_code_label * tmp_label = gen_label_rtx ();
 946     char label_buf[256];
 947     char buffer[128];
 948     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 949                                  CODE_LABEL_NUMBER (tmp_label));
 950     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 951     rtx dest_label = operands[pos_label];
 952     operands[pos_label] = tmp_label;
 953
 954     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 955     output_asm_insn (buffer, operands);
 956
 957     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 958     operands[pos_label] = dest_label;
 959     output_asm_insn (buffer, operands);
 960     return "";
 961 }
 962
 963 void
 964 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 965 {
 966   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 967   if (TARGET_GENERAL_REGS_ONLY)
 968     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 969   else
 970     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 971 }
 972
 973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 974    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 975    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 976    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 977    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 978    irrespectively of its cost results in bad allocations with many redundant
 979    int<->FP moves which are expensive on various cores.
 980    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 981    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 982    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 983    Otherwise set the allocno class depending on the mode.
 984    The result of this is that it is no longer inefficient to have a higher
 985    memory move cost than the register move cost.
 986 */
 987
 988 static reg_class_t
 989 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 990                                          reg_class_t best_class)
 991 {
 992   machine_mode mode;
 993
 994   if (allocno_class != ALL_REGS)
 995     return allocno_class;
 996
 997   if (best_class != ALL_REGS)
 998     return best_class;
 999
1000   mode = PSEUDO_REGNO_MODE (regno);
1001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1002 }
1003
1004 static unsigned int
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1006 {
1007   if (GET_MODE_UNIT_SIZE (mode) == 4)
1008     return aarch64_tune_params.min_div_recip_mul_sf;
1009   return aarch64_tune_params.min_div_recip_mul_df;
1010 }
1011
1012 static int
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1014                              machine_mode mode)
1015 {
1016   if (VECTOR_MODE_P (mode))
1017     return aarch64_tune_params.vec_reassoc_width;
1018   if (INTEGRAL_MODE_P (mode))
1019     return aarch64_tune_params.int_reassoc_width;
1020   if (FLOAT_MODE_P (mode))
1021     return aarch64_tune_params.fp_reassoc_width;
1022   return 1;
1023 }
1024
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1026 unsigned
1027 aarch64_dbx_register_number (unsigned regno)
1028 {
1029    if (GP_REGNUM_P (regno))
1030      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1031    else if (regno == SP_REGNUM)
1032      return AARCH64_DWARF_SP;
1033    else if (FP_REGNUM_P (regno))
1034      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1035
1036    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037       equivalent DWARF register.  */
1038    return DWARF_FRAME_REGISTERS;
1039 }
1040
1041 /* Return TRUE if MODE is any of the large INT modes.  */
1042 static bool
1043 aarch64_vect_struct_mode_p (machine_mode mode)
1044 {
1045   return mode == OImode || mode == CImode || mode == XImode;
1046 }
1047
1048 /* Return TRUE if MODE is any of the vector modes.  */
1049 static bool
1050 aarch64_vector_mode_p (machine_mode mode)
1051 {
1052   return aarch64_vector_mode_supported_p (mode)
1053          || aarch64_vect_struct_mode_p (mode);
1054 }
1055
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1057 static bool
1058 aarch64_array_mode_supported_p (machine_mode mode,
1059                                 unsigned HOST_WIDE_INT nelems)
1060 {
1061   if (TARGET_SIMD
1062       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1063           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1064       && (nelems >= 2 && nelems <= 4))
1065     return true;
1066
1067   return false;
1068 }
1069
1070 /* Implement HARD_REGNO_NREGS.  */
1071
1072 int
1073 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1074 {
1075   switch (aarch64_regno_regclass (regno))
1076     {
1077     case FP_REGS:
1078     case FP_LO_REGS:
1079       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1080     default:
1081       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1082     }
1083   gcc_unreachable ();
1084 }
1085
1086 /* Implement HARD_REGNO_MODE_OK.  */
1087
1088 int
1089 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1090 {
1091   if (GET_MODE_CLASS (mode) == MODE_CC)
1092     return regno == CC_REGNUM;
1093
1094   if (regno == SP_REGNUM)
1095     /* The purpose of comparing with ptr_mode is to support the
1096        global register variable associated with the stack pointer
1097        register via the syntax of asm ("wsp") in ILP32.  */
1098     return mode == Pmode || mode == ptr_mode;
1099
1100   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1101     return mode == Pmode;
1102
1103   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1104     return 1;
1105
1106   if (FP_REGNUM_P (regno))
1107     {
1108       if (aarch64_vect_struct_mode_p (mode))
1109         return
1110           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1111       else
1112         return 1;
1113     }
1114
1115   return 0;
1116 }
1117
1118 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1119 machine_mode
1120 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1121                                      machine_mode mode)
1122 {
1123   /* Handle modes that fit within single registers.  */
1124   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1125     {
1126       if (GET_MODE_SIZE (mode) >= 4)
1127         return mode;
1128       else
1129         return SImode;
1130     }
1131   /* Fall back to generic for multi-reg and very large modes.  */
1132   else
1133     return choose_hard_reg_mode (regno, nregs, false);
1134 }
1135
1136 /* Return true if calls to DECL should be treated as
1137    long-calls (ie called via a register).  */
1138 static bool
1139 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1140 {
1141   return false;
1142 }
1143
1144 /* Return true if calls to symbol-ref SYM should be treated as
1145    long-calls (ie called via a register).  */
1146 bool
1147 aarch64_is_long_call_p (rtx sym)
1148 {
1149   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1150 }
1151
1152 /* Return true if calls to symbol-ref SYM should not go through
1153    plt stubs.  */
1154
1155 bool
1156 aarch64_is_noplt_call_p (rtx sym)
1157 {
1158   const_tree decl = SYMBOL_REF_DECL (sym);
1159
1160   if (flag_pic
1161       && decl
1162       && (!flag_plt
1163           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1164       && !targetm.binds_local_p (decl))
1165     return true;
1166
1167   return false;
1168 }
1169
1170 /* Return true if the offsets to a zero/sign-extract operation
1171    represent an expression that matches an extend operation.  The
1172    operands represent the paramters from
1173
1174    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1175 bool
1176 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1177                                 rtx extract_imm)
1178 {
1179   HOST_WIDE_INT mult_val, extract_val;
1180
1181   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1182     return false;
1183
1184   mult_val = INTVAL (mult_imm);
1185   extract_val = INTVAL (extract_imm);
1186
1187   if (extract_val > 8
1188       && extract_val < GET_MODE_BITSIZE (mode)
1189       && exact_log2 (extract_val & ~7) > 0
1190       && (extract_val & 7) <= 4
1191       && mult_val == (1 << (extract_val & 7)))
1192     return true;
1193
1194   return false;
1195 }
1196
1197 /* Emit an insn that's a simple single-set.  Both the operands must be
1198    known to be valid.  */
1199 inline static rtx_insn *
1200 emit_set_insn (rtx x, rtx y)
1201 {
1202   return emit_insn (gen_rtx_SET (x, y));
1203 }
1204
1205 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1206    return the rtx for register 0 in the proper mode.  */
1207 rtx
1208 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1209 {
1210   machine_mode mode = SELECT_CC_MODE (code, x, y);
1211   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1212
1213   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1214   return cc_reg;
1215 }
1216
1217 /* Build the SYMBOL_REF for __tls_get_addr.  */
1218
1219 static GTY(()) rtx tls_get_addr_libfunc;
1220
1221 rtx
1222 aarch64_tls_get_addr (void)
1223 {
1224   if (!tls_get_addr_libfunc)
1225     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1226   return tls_get_addr_libfunc;
1227 }
1228
1229 /* Return the TLS model to use for ADDR.  */
1230
1231 static enum tls_model
1232 tls_symbolic_operand_type (rtx addr)
1233 {
1234   enum tls_model tls_kind = TLS_MODEL_NONE;
1235   rtx sym, addend;
1236
1237   if (GET_CODE (addr) == CONST)
1238     {
1239       split_const (addr, &sym, &addend);
1240       if (GET_CODE (sym) == SYMBOL_REF)
1241         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1242     }
1243   else if (GET_CODE (addr) == SYMBOL_REF)
1244     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1245
1246   return tls_kind;
1247 }
1248
1249 /* We'll allow lo_sum's in addresses in our legitimate addresses
1250    so that combine would take care of combining addresses where
1251    necessary, but for generation purposes, we'll generate the address
1252    as :
1253    RTL                               Absolute
1254    tmp = hi (symbol_ref);            adrp  x1, foo
1255    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1256                                      nop
1257
1258    PIC                               TLS
1259    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1260    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1261                                      bl   __tls_get_addr
1262                                      nop
1263
1264    Load TLS symbol, depending on TLS mechanism and TLS access model.
1265
1266    Global Dynamic - Traditional TLS:
1267    adrp tmp, :tlsgd:imm
1268    add  dest, tmp, #:tlsgd_lo12:imm
1269    bl   __tls_get_addr
1270
1271    Global Dynamic - TLS Descriptors:
1272    adrp dest, :tlsdesc:imm
1273    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1274    add  dest, dest, #:tlsdesc_lo12:imm
1275    blr  tmp
1276    mrs  tp, tpidr_el0
1277    add  dest, dest, tp
1278
1279    Initial Exec:
1280    mrs  tp, tpidr_el0
1281    adrp tmp, :gottprel:imm
1282    ldr  dest, [tmp, #:gottprel_lo12:imm]
1283    add  dest, dest, tp
1284
1285    Local Exec:
1286    mrs  tp, tpidr_el0
1287    add  t0, tp, #:tprel_hi12:imm, lsl #12
1288    add  t0, t0, #:tprel_lo12_nc:imm
1289 */
1290
1291 static void
1292 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1293                                    enum aarch64_symbol_type type)
1294 {
1295   switch (type)
1296     {
1297     case SYMBOL_SMALL_ABSOLUTE:
1298       {
1299         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1300         rtx tmp_reg = dest;
1301         machine_mode mode = GET_MODE (dest);
1302
1303         gcc_assert (mode == Pmode || mode == ptr_mode);
1304
1305         if (can_create_pseudo_p ())
1306           tmp_reg = gen_reg_rtx (mode);
1307
1308         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1309         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1310         return;
1311       }
1312
1313     case SYMBOL_TINY_ABSOLUTE:
1314       emit_insn (gen_rtx_SET (dest, imm));
1315       return;
1316
1317     case SYMBOL_SMALL_GOT_28K:
1318       {
1319         machine_mode mode = GET_MODE (dest);
1320         rtx gp_rtx = pic_offset_table_rtx;
1321         rtx insn;
1322         rtx mem;
1323
1324         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1325            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1326            decide rtx costs, in which case pic_offset_table_rtx is not
1327            initialized.  For that case no need to generate the first adrp
1328            instruction as the final cost for global variable access is
1329            one instruction.  */
1330         if (gp_rtx != NULL)
1331           {
1332             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1333                using the page base as GOT base, the first page may be wasted,
1334                in the worst scenario, there is only 28K space for GOT).
1335
1336                The generate instruction sequence for accessing global variable
1337                is:
1338
1339                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1340
1341                Only one instruction needed. But we must initialize
1342                pic_offset_table_rtx properly.  We generate initialize insn for
1343                every global access, and allow CSE to remove all redundant.
1344
1345                The final instruction sequences will look like the following
1346                for multiply global variables access.
1347
1348                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1349
1350                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1351                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1352                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1353                  ...  */
1354
1355             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1356             crtl->uses_pic_offset_table = 1;
1357             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1358
1359             if (mode != GET_MODE (gp_rtx))
1360              gp_rtx = gen_lowpart (mode, gp_rtx);
1361
1362           }
1363
1364         if (mode == ptr_mode)
1365           {
1366             if (mode == DImode)
1367               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1368             else
1369               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1370
1371             mem = XVECEXP (SET_SRC (insn), 0, 0);
1372           }
1373         else
1374           {
1375             gcc_assert (mode == Pmode);
1376
1377             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1378             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1379           }
1380
1381         /* The operand is expected to be MEM.  Whenever the related insn
1382            pattern changed, above code which calculate mem should be
1383            updated.  */
1384         gcc_assert (GET_CODE (mem) == MEM);
1385         MEM_READONLY_P (mem) = 1;
1386         MEM_NOTRAP_P (mem) = 1;
1387         emit_insn (insn);
1388         return;
1389       }
1390
1391     case SYMBOL_SMALL_GOT_4G:
1392       {
1393         /* In ILP32, the mode of dest can be either SImode or DImode,
1394            while the got entry is always of SImode size.  The mode of
1395            dest depends on how dest is used: if dest is assigned to a
1396            pointer (e.g. in the memory), it has SImode; it may have
1397            DImode if dest is dereferenced to access the memeory.
1398            This is why we have to handle three different ldr_got_small
1399            patterns here (two patterns for ILP32).  */
1400
1401         rtx insn;
1402         rtx mem;
1403         rtx tmp_reg = dest;
1404         machine_mode mode = GET_MODE (dest);
1405
1406         if (can_create_pseudo_p ())
1407           tmp_reg = gen_reg_rtx (mode);
1408
1409         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1410         if (mode == ptr_mode)
1411           {
1412             if (mode == DImode)
1413               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1414             else
1415               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1416
1417             mem = XVECEXP (SET_SRC (insn), 0, 0);
1418           }
1419         else
1420           {
1421             gcc_assert (mode == Pmode);
1422
1423             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1424             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1425           }
1426
1427         gcc_assert (GET_CODE (mem) == MEM);
1428         MEM_READONLY_P (mem) = 1;
1429         MEM_NOTRAP_P (mem) = 1;
1430         emit_insn (insn);
1431         return;
1432       }
1433
1434     case SYMBOL_SMALL_TLSGD:
1435       {
1436         rtx_insn *insns;
1437         machine_mode mode = GET_MODE (dest);
1438         rtx result = gen_rtx_REG (mode, R0_REGNUM);
1439
1440         start_sequence ();
1441         if (TARGET_ILP32)
1442           aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1443         else
1444           aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1445         insns = get_insns ();
1446         end_sequence ();
1447
1448         RTL_CONST_CALL_P (insns) = 1;
1449         emit_libcall_block (insns, dest, result, imm);
1450         return;
1451       }
1452
1453     case SYMBOL_SMALL_TLSDESC:
1454       {
1455         machine_mode mode = GET_MODE (dest);
1456         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1457         rtx tp;
1458
1459         gcc_assert (mode == Pmode || mode == ptr_mode);
1460
1461         /* In ILP32, the got entry is always of SImode size.  Unlike
1462            small GOT, the dest is fixed at reg 0.  */
1463         if (TARGET_ILP32)
1464           emit_insn (gen_tlsdesc_small_si (imm));
1465         else
1466           emit_insn (gen_tlsdesc_small_di (imm));
1467         tp = aarch64_load_tp (NULL);
1468
1469         if (mode != Pmode)
1470           tp = gen_lowpart (mode, tp);
1471
1472         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1473         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1474         return;
1475       }
1476
1477     case SYMBOL_SMALL_TLSIE:
1478       {
1479         /* In ILP32, the mode of dest can be either SImode or DImode,
1480            while the got entry is always of SImode size.  The mode of
1481            dest depends on how dest is used: if dest is assigned to a
1482            pointer (e.g. in the memory), it has SImode; it may have
1483            DImode if dest is dereferenced to access the memeory.
1484            This is why we have to handle three different tlsie_small
1485            patterns here (two patterns for ILP32).  */
1486         machine_mode mode = GET_MODE (dest);
1487         rtx tmp_reg = gen_reg_rtx (mode);
1488         rtx tp = aarch64_load_tp (NULL);
1489
1490         if (mode == ptr_mode)
1491           {
1492             if (mode == DImode)
1493               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1494             else
1495               {
1496                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1497                 tp = gen_lowpart (mode, tp);
1498               }
1499           }
1500         else
1501           {
1502             gcc_assert (mode == Pmode);
1503             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1504           }
1505
1506         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1507         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508         return;
1509       }
1510
1511     case SYMBOL_TLSLE12:
1512     case SYMBOL_TLSLE24:
1513     case SYMBOL_TLSLE32:
1514     case SYMBOL_TLSLE48:
1515       {
1516         machine_mode mode = GET_MODE (dest);
1517         rtx tp = aarch64_load_tp (NULL);
1518
1519         if (mode != Pmode)
1520           tp = gen_lowpart (mode, tp);
1521
1522         switch (type)
1523           {
1524           case SYMBOL_TLSLE12:
1525             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1526                         (dest, tp, imm));
1527             break;
1528           case SYMBOL_TLSLE24:
1529             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1530                         (dest, tp, imm));
1531           break;
1532           case SYMBOL_TLSLE32:
1533             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1534                         (dest, imm));
1535             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1536                         (dest, dest, tp));
1537           break;
1538           case SYMBOL_TLSLE48:
1539             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1540                         (dest, imm));
1541             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1542                         (dest, dest, tp));
1543             break;
1544           default:
1545             gcc_unreachable ();
1546           }
1547
1548         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1549         return;
1550       }
1551
1552     case SYMBOL_TINY_GOT:
1553       emit_insn (gen_ldr_got_tiny (dest, imm));
1554       return;
1555
1556     case SYMBOL_TINY_TLSIE:
1557       {
1558         machine_mode mode = GET_MODE (dest);
1559         rtx tp = aarch64_load_tp (NULL);
1560
1561         if (mode == ptr_mode)
1562           {
1563             if (mode == DImode)
1564               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1565             else
1566               {
1567                 tp = gen_lowpart (mode, tp);
1568                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1569               }
1570           }
1571         else
1572           {
1573             gcc_assert (mode == Pmode);
1574             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1575           }
1576
1577         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1578         return;
1579       }
1580
1581     default:
1582       gcc_unreachable ();
1583     }
1584 }
1585
1586 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1587    handle all moves if !can_create_pseudo_p ().  The distinction is
1588    important because, unlike emit_move_insn, the move expanders know
1589    how to force Pmode objects into the constant pool even when the
1590    constant pool address is not itself legitimate.  */
1591 static rtx
1592 aarch64_emit_move (rtx dest, rtx src)
1593 {
1594   return (can_create_pseudo_p ()
1595           ? emit_move_insn (dest, src)
1596           : emit_move_insn_1 (dest, src));
1597 }
1598
1599 /* Split a 128-bit move operation into two 64-bit move operations,
1600    taking care to handle partial overlap of register to register
1601    copies.  Special cases are needed when moving between GP regs and
1602    FP regs.  SRC can be a register, constant or memory; DST a register
1603    or memory.  If either operand is memory it must not have any side
1604    effects.  */
1605 void
1606 aarch64_split_128bit_move (rtx dst, rtx src)
1607 {
1608   rtx dst_lo, dst_hi;
1609   rtx src_lo, src_hi;
1610
1611   machine_mode mode = GET_MODE (dst);
1612
1613   gcc_assert (mode == TImode || mode == TFmode);
1614   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1615   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1616
1617   if (REG_P (dst) && REG_P (src))
1618     {
1619       int src_regno = REGNO (src);
1620       int dst_regno = REGNO (dst);
1621
1622       /* Handle FP <-> GP regs.  */
1623       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1624         {
1625           src_lo = gen_lowpart (word_mode, src);
1626           src_hi = gen_highpart (word_mode, src);
1627
1628           if (mode == TImode)
1629             {
1630               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1631               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1632             }
1633           else
1634             {
1635               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1636               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1637             }
1638           return;
1639         }
1640       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1641         {
1642           dst_lo = gen_lowpart (word_mode, dst);
1643           dst_hi = gen_highpart (word_mode, dst);
1644
1645           if (mode == TImode)
1646             {
1647               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1648               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1649             }
1650           else
1651             {
1652               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1653               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1654             }
1655           return;
1656         }
1657     }
1658
1659   dst_lo = gen_lowpart (word_mode, dst);
1660   dst_hi = gen_highpart (word_mode, dst);
1661   src_lo = gen_lowpart (word_mode, src);
1662   src_hi = gen_highpart_mode (word_mode, mode, src);
1663
1664   /* At most one pairing may overlap.  */
1665   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1666     {
1667       aarch64_emit_move (dst_hi, src_hi);
1668       aarch64_emit_move (dst_lo, src_lo);
1669     }
1670   else
1671     {
1672       aarch64_emit_move (dst_lo, src_lo);
1673       aarch64_emit_move (dst_hi, src_hi);
1674     }
1675 }
1676
1677 bool
1678 aarch64_split_128bit_move_p (rtx dst, rtx src)
1679 {
1680   return (! REG_P (src)
1681           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1682 }
1683
1684 /* Split a complex SIMD combine.  */
1685
1686 void
1687 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1688 {
1689   machine_mode src_mode = GET_MODE (src1);
1690   machine_mode dst_mode = GET_MODE (dst);
1691
1692   gcc_assert (VECTOR_MODE_P (dst_mode));
1693   gcc_assert (register_operand (dst, dst_mode)
1694               && register_operand (src1, src_mode)
1695               && register_operand (src2, src_mode));
1696
1697   rtx (*gen) (rtx, rtx, rtx);
1698
1699   switch (src_mode)
1700     {
1701     case E_V8QImode:
1702       gen = gen_aarch64_simd_combinev8qi;
1703       break;
1704     case E_V4HImode:
1705       gen = gen_aarch64_simd_combinev4hi;
1706       break;
1707     case E_V2SImode:
1708       gen = gen_aarch64_simd_combinev2si;
1709       break;
1710     case E_V4HFmode:
1711       gen = gen_aarch64_simd_combinev4hf;
1712       break;
1713     case E_V2SFmode:
1714       gen = gen_aarch64_simd_combinev2sf;
1715       break;
1716     case E_DImode:
1717       gen = gen_aarch64_simd_combinedi;
1718       break;
1719     case E_DFmode:
1720       gen = gen_aarch64_simd_combinedf;
1721       break;
1722     default:
1723       gcc_unreachable ();
1724     }
1725
1726   emit_insn (gen (dst, src1, src2));
1727   return;
1728 }
1729
1730 /* Split a complex SIMD move.  */
1731
1732 void
1733 aarch64_split_simd_move (rtx dst, rtx src)
1734 {
1735   machine_mode src_mode = GET_MODE (src);
1736   machine_mode dst_mode = GET_MODE (dst);
1737
1738   gcc_assert (VECTOR_MODE_P (dst_mode));
1739
1740   if (REG_P (dst) && REG_P (src))
1741     {
1742       rtx (*gen) (rtx, rtx);
1743
1744       gcc_assert (VECTOR_MODE_P (src_mode));
1745
1746       switch (src_mode)
1747         {
1748         case E_V16QImode:
1749           gen = gen_aarch64_split_simd_movv16qi;
1750           break;
1751         case E_V8HImode:
1752           gen = gen_aarch64_split_simd_movv8hi;
1753           break;
1754         case E_V4SImode:
1755           gen = gen_aarch64_split_simd_movv4si;
1756           break;
1757         case E_V2DImode:
1758           gen = gen_aarch64_split_simd_movv2di;
1759           break;
1760         case E_V8HFmode:
1761           gen = gen_aarch64_split_simd_movv8hf;
1762           break;
1763         case E_V4SFmode:
1764           gen = gen_aarch64_split_simd_movv4sf;
1765           break;
1766         case E_V2DFmode:
1767           gen = gen_aarch64_split_simd_movv2df;
1768           break;
1769         default:
1770           gcc_unreachable ();
1771         }
1772
1773       emit_insn (gen (dst, src));
1774       return;
1775     }
1776 }
1777
1778 bool
1779 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1780                               machine_mode ymode, rtx y)
1781 {
1782   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1783   gcc_assert (r != NULL);
1784   return rtx_equal_p (x, r);
1785 }
1786
1787
1788 static rtx
1789 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1790 {
1791   if (can_create_pseudo_p ())
1792     return force_reg (mode, value);
1793   else
1794     {
1795       x = aarch64_emit_move (x, value);
1796       return x;
1797     }
1798 }
1799
1800
1801 static rtx
1802 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1803 {
1804   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1805     {
1806       rtx high;
1807       /* Load the full offset into a register.  This
1808          might be improvable in the future.  */
1809       high = GEN_INT (offset);
1810       offset = 0;
1811       high = aarch64_force_temporary (mode, temp, high);
1812       reg = aarch64_force_temporary (mode, temp,
1813                                      gen_rtx_PLUS (mode, high, reg));
1814     }
1815   return plus_constant (mode, reg, offset);
1816 }
1817
1818 static int
1819 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1820                                 machine_mode mode)
1821 {
1822   int i;
1823   unsigned HOST_WIDE_INT val, val2, mask;
1824   int one_match, zero_match;
1825   int num_insns;
1826
1827   val = INTVAL (imm);
1828
1829   if (aarch64_move_imm (val, mode))
1830     {
1831       if (generate)
1832         emit_insn (gen_rtx_SET (dest, imm));
1833       return 1;
1834     }
1835
1836   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1837      (with XXXX non-zero). In that case check to see if the move can be done in
1838      a smaller mode.  */
1839   val2 = val & 0xffffffff;
1840   if (mode == DImode
1841       && aarch64_move_imm (val2, SImode)
1842       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
1843     {
1844       if (generate)
1845         emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1846
1847       /* Check if we have to emit a second instruction by checking to see
1848          if any of the upper 32 bits of the original DI mode value is set.  */
1849       if (val == val2)
1850         return 1;
1851
1852       i = (val >> 48) ? 48 : 32;
1853
1854       if (generate)
1855          emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1856                                     GEN_INT ((val >> i) & 0xffff)));
1857
1858       return 2;
1859     }
1860
1861   if ((val >> 32) == 0 || mode == SImode)
1862     {
1863       if (generate)
1864         {
1865           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1866           if (mode == SImode)
1867             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1868                                        GEN_INT ((val >> 16) & 0xffff)));
1869           else
1870             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1871                                        GEN_INT ((val >> 16) & 0xffff)));
1872         }
1873       return 2;
1874     }
1875
1876   /* Remaining cases are all for DImode.  */
1877
1878   mask = 0xffff;
1879   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1880     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1881   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1882     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1883
1884   if (zero_match != 2 && one_match != 2)
1885     {
1886       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1887          For a 64-bit bitmask try whether changing 16 bits to all ones or
1888          zeroes creates a valid bitmask.  To check any repeated bitmask,
1889          try using 16 bits from the other 32-bit half of val.  */
1890
1891       for (i = 0; i < 64; i += 16, mask <<= 16)
1892         {
1893           val2 = val & ~mask;
1894           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1895             break;
1896           val2 = val | mask;
1897           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1898             break;
1899           val2 = val2 & ~mask;
1900           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1901           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1902             break;
1903         }
1904       if (i != 64)
1905         {
1906           if (generate)
1907             {
1908               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1909               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1910                                          GEN_INT ((val >> i) & 0xffff)));
1911             }
1912           return 2;
1913         }
1914     }
1915
1916   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1917      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1918      otherwise skip zero bits.  */
1919
1920   num_insns = 1;
1921   mask = 0xffff;
1922   val2 = one_match > zero_match ? ~val : val;
1923   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1924
1925   if (generate)
1926     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1927                                            ? (val | ~(mask << i))
1928                                            : (val & (mask << i)))));
1929   for (i += 16; i < 64; i += 16)
1930     {
1931       if ((val2 & (mask << i)) == 0)
1932         continue;
1933       if (generate)
1934         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1935                                    GEN_INT ((val >> i) & 0xffff)));
1936       num_insns ++;
1937     }
1938
1939   return num_insns;
1940 }
1941
1942
1943 void
1944 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1945 {
1946   machine_mode mode = GET_MODE (dest);
1947
1948   gcc_assert (mode == SImode || mode == DImode);
1949
1950   /* Check on what type of symbol it is.  */
1951   if (GET_CODE (imm) == SYMBOL_REF
1952       || GET_CODE (imm) == LABEL_REF
1953       || GET_CODE (imm) == CONST)
1954     {
1955       rtx mem, base, offset;
1956       enum aarch64_symbol_type sty;
1957
1958       /* If we have (const (plus symbol offset)), separate out the offset
1959          before we start classifying the symbol.  */
1960       split_const (imm, &base, &offset);
1961
1962       sty = aarch64_classify_symbol (base, offset);
1963       switch (sty)
1964         {
1965         case SYMBOL_FORCE_TO_MEM:
1966           if (offset != const0_rtx
1967               && targetm.cannot_force_const_mem (mode, imm))
1968             {
1969               gcc_assert (can_create_pseudo_p ());
1970               base = aarch64_force_temporary (mode, dest, base);
1971               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1972               aarch64_emit_move (dest, base);
1973               return;
1974             }
1975
1976           mem = force_const_mem (ptr_mode, imm);
1977           gcc_assert (mem);
1978
1979           /* If we aren't generating PC relative literals, then
1980              we need to expand the literal pool access carefully.
1981              This is something that needs to be done in a number
1982              of places, so could well live as a separate function.  */
1983           if (!aarch64_pcrelative_literal_loads)
1984             {
1985               gcc_assert (can_create_pseudo_p ());
1986               base = gen_reg_rtx (ptr_mode);
1987               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1988               if (ptr_mode != Pmode)
1989                 base = convert_memory_address (Pmode, base);
1990               mem = gen_rtx_MEM (ptr_mode, base);
1991             }
1992
1993           if (mode != ptr_mode)
1994             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1995
1996           emit_insn (gen_rtx_SET (dest, mem));
1997
1998           return;
1999
2000         case SYMBOL_SMALL_TLSGD:
2001         case SYMBOL_SMALL_TLSDESC:
2002         case SYMBOL_SMALL_TLSIE:
2003         case SYMBOL_SMALL_GOT_28K:
2004         case SYMBOL_SMALL_GOT_4G:
2005         case SYMBOL_TINY_GOT:
2006         case SYMBOL_TINY_TLSIE:
2007           if (offset != const0_rtx)
2008             {
2009               gcc_assert(can_create_pseudo_p ());
2010               base = aarch64_force_temporary (mode, dest, base);
2011               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
2012               aarch64_emit_move (dest, base);
2013               return;
2014             }
2015           /* FALLTHRU */
2016
2017         case SYMBOL_SMALL_ABSOLUTE:
2018         case SYMBOL_TINY_ABSOLUTE:
2019         case SYMBOL_TLSLE12:
2020         case SYMBOL_TLSLE24:
2021         case SYMBOL_TLSLE32:
2022         case SYMBOL_TLSLE48:
2023           aarch64_load_symref_appropriately (dest, imm, sty);
2024           return;
2025
2026         default:
2027           gcc_unreachable ();
2028         }
2029     }
2030
2031   if (!CONST_INT_P (imm))
2032     {
2033       if (GET_CODE (imm) == HIGH)
2034         emit_insn (gen_rtx_SET (dest, imm));
2035       else
2036         {
2037           rtx mem = force_const_mem (mode, imm);
2038           gcc_assert (mem);
2039           emit_insn (gen_rtx_SET (dest, mem));
2040         }
2041
2042       return;
2043     }
2044
2045   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
2046 }
2047
2048 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
2049    temporary value if necessary.  FRAME_RELATED_P should be true if
2050    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2051    to the generated instructions.  If SCRATCHREG is known to hold
2052    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2053    immediate again.
2054
2055    Since this function may be used to adjust the stack pointer, we must
2056    ensure that it cannot cause transient stack deallocation (for example
2057    by first incrementing SP and then decrementing when adjusting by a
2058    large immediate).  */
2059
2060 static void
2061 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
2062                                HOST_WIDE_INT delta, bool frame_related_p,
2063                                bool emit_move_imm)
2064 {
2065   HOST_WIDE_INT mdelta = abs_hwi (delta);
2066   rtx this_rtx = gen_rtx_REG (mode, regnum);
2067   rtx_insn *insn;
2068
2069   if (!mdelta)
2070     return;
2071
2072   /* Single instruction adjustment.  */
2073   if (aarch64_uimm12_shift (mdelta))
2074     {
2075       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2076       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2077       return;
2078     }
2079
2080   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2081      Only do this if mdelta is not a 16-bit move as adjusting using a move
2082      is better.  */
2083   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2084     {
2085       HOST_WIDE_INT low_off = mdelta & 0xfff;
2086
2087       low_off = delta < 0 ? -low_off : low_off;
2088       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2089       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2090       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2091       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2092       return;
2093     }
2094
2095   /* Emit a move immediate if required and an addition/subtraction.  */
2096   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2097   if (emit_move_imm)
2098     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2099   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2100                               : gen_add2_insn (this_rtx, scratch_rtx));
2101   if (frame_related_p)
2102     {
2103       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2104       rtx adj = plus_constant (mode, this_rtx, delta);
2105       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2106     }
2107 }
2108
2109 static inline void
2110 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2111                       HOST_WIDE_INT delta)
2112 {
2113   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2114 }
2115
2116 static inline void
2117 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2118 {
2119   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2120                                  true, emit_move_imm);
2121 }
2122
2123 static inline void
2124 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2125 {
2126   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2127                                  frame_related_p, true);
2128 }
2129
2130 static bool
2131 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2132                                  tree exp ATTRIBUTE_UNUSED)
2133 {
2134   /* Currently, always true.  */
2135   return true;
2136 }
2137
2138 /* Implement TARGET_PASS_BY_REFERENCE.  */
2139
2140 static bool
2141 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2142                            machine_mode mode,
2143                            const_tree type,
2144                            bool named ATTRIBUTE_UNUSED)
2145 {
2146   HOST_WIDE_INT size;
2147   machine_mode dummymode;
2148   int nregs;
2149
2150   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2151   size = (mode == BLKmode && type)
2152     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2153
2154   /* Aggregates are passed by reference based on their size.  */
2155   if (type && AGGREGATE_TYPE_P (type))
2156     {
2157       size = int_size_in_bytes (type);
2158     }
2159
2160   /* Variable sized arguments are always returned by reference.  */
2161   if (size < 0)
2162     return true;
2163
2164   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2165   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2166                                                &dummymode, &nregs,
2167                                                NULL))
2168     return false;
2169
2170   /* Arguments which are variable sized or larger than 2 registers are
2171      passed by reference unless they are a homogenous floating point
2172      aggregate.  */
2173   return size > 2 * UNITS_PER_WORD;
2174 }
2175
2176 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2177 static bool
2178 aarch64_return_in_msb (const_tree valtype)
2179 {
2180   machine_mode dummy_mode;
2181   int dummy_int;
2182
2183   /* Never happens in little-endian mode.  */
2184   if (!BYTES_BIG_ENDIAN)
2185     return false;
2186
2187   /* Only composite types smaller than or equal to 16 bytes can
2188      be potentially returned in registers.  */
2189   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2190       || int_size_in_bytes (valtype) <= 0
2191       || int_size_in_bytes (valtype) > 16)
2192     return false;
2193
2194   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2195      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2196      is always passed/returned in the least significant bits of fp/simd
2197      register(s).  */
2198   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2199                                                &dummy_mode, &dummy_int, NULL))
2200     return false;
2201
2202   return true;
2203 }
2204
2205 /* Implement TARGET_FUNCTION_VALUE.
2206    Define how to find the value returned by a function.  */
2207
2208 static rtx
2209 aarch64_function_value (const_tree type, const_tree func,
2210                         bool outgoing ATTRIBUTE_UNUSED)
2211 {
2212   machine_mode mode;
2213   int unsignedp;
2214   int count;
2215   machine_mode ag_mode;
2216
2217   mode = TYPE_MODE (type);
2218   if (INTEGRAL_TYPE_P (type))
2219     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2220
2221   if (aarch64_return_in_msb (type))
2222     {
2223       HOST_WIDE_INT size = int_size_in_bytes (type);
2224
2225       if (size % UNITS_PER_WORD != 0)
2226         {
2227           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2228           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2229         }
2230     }
2231
2232   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2233                                                &ag_mode, &count, NULL))
2234     {
2235       if (!aarch64_composite_type_p (type, mode))
2236         {
2237           gcc_assert (count == 1 && mode == ag_mode);
2238           return gen_rtx_REG (mode, V0_REGNUM);
2239         }
2240       else
2241         {
2242           int i;
2243           rtx par;
2244
2245           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2246           for (i = 0; i < count; i++)
2247             {
2248               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2249               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2250                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2251               XVECEXP (par, 0, i) = tmp;
2252             }
2253           return par;
2254         }
2255     }
2256   else
2257     return gen_rtx_REG (mode, R0_REGNUM);
2258 }
2259
2260 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2261    Return true if REGNO is the number of a hard register in which the values
2262    of called function may come back.  */
2263
2264 static bool
2265 aarch64_function_value_regno_p (const unsigned int regno)
2266 {
2267   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2268      of 16-byte return values are: 128-bit integers and 16-byte small
2269      structures (excluding homogeneous floating-point aggregates).  */
2270   if (regno == R0_REGNUM || regno == R1_REGNUM)
2271     return true;
2272
2273   /* Up to four fp/simd registers can return a function value, e.g. a
2274      homogeneous floating-point aggregate having four members.  */
2275   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2276     return TARGET_FLOAT;
2277
2278   return false;
2279 }
2280
2281 /* Implement TARGET_RETURN_IN_MEMORY.
2282
2283    If the type T of the result of a function is such that
2284      void func (T arg)
2285    would require that arg be passed as a value in a register (or set of
2286    registers) according to the parameter passing rules, then the result
2287    is returned in the same registers as would be used for such an
2288    argument.  */
2289
2290 static bool
2291 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2292 {
2293   HOST_WIDE_INT size;
2294   machine_mode ag_mode;
2295   int count;
2296
2297   if (!AGGREGATE_TYPE_P (type)
2298       && TREE_CODE (type) != COMPLEX_TYPE
2299       && TREE_CODE (type) != VECTOR_TYPE)
2300     /* Simple scalar types always returned in registers.  */
2301     return false;
2302
2303   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2304                                                type,
2305                                                &ag_mode,
2306                                                &count,
2307                                                NULL))
2308     return false;
2309
2310   /* Types larger than 2 registers returned in memory.  */
2311   size = int_size_in_bytes (type);
2312   return (size < 0 || size > 2 * UNITS_PER_WORD);
2313 }
2314
2315 static bool
2316 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2317                                const_tree type, int *nregs)
2318 {
2319   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2320   return aarch64_vfp_is_call_or_return_candidate (mode,
2321                                                   type,
2322                                                   &pcum->aapcs_vfp_rmode,
2323                                                   nregs,
2324                                                   NULL);
2325 }
2326
2327 /* Given MODE and TYPE of a function argument, return the alignment in
2328    bits.  The idea is to suppress any stronger alignment requested by
2329    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2330    This is a helper function for local use only.  */
2331
2332 static unsigned int
2333 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2334 {
2335   if (!type)
2336     return GET_MODE_ALIGNMENT (mode);
2337
2338   if (integer_zerop (TYPE_SIZE (type)))
2339     return 0;
2340
2341   gcc_assert (TYPE_MODE (type) == mode);
2342
2343   if (!AGGREGATE_TYPE_P (type))
2344     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2345
2346   if (TREE_CODE (type) == ARRAY_TYPE)
2347     return TYPE_ALIGN (TREE_TYPE (type));
2348
2349   unsigned int alignment = 0;
2350   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2351     if (TREE_CODE (field) == FIELD_DECL)
2352       alignment = std::max (alignment, DECL_ALIGN (field));
2353
2354   return alignment;
2355 }
2356
2357 /* Layout a function argument according to the AAPCS64 rules.  The rule
2358    numbers refer to the rule numbers in the AAPCS64.  */
2359
2360 static void
2361 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2362                     const_tree type,
2363                     bool named ATTRIBUTE_UNUSED)
2364 {
2365   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2366   int ncrn, nvrn, nregs;
2367   bool allocate_ncrn, allocate_nvrn;
2368   HOST_WIDE_INT size;
2369
2370   /* We need to do this once per argument.  */
2371   if (pcum->aapcs_arg_processed)
2372     return;
2373
2374   pcum->aapcs_arg_processed = true;
2375
2376   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2377   size
2378     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2379                 UNITS_PER_WORD);
2380
2381   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2382   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2383                                                  mode,
2384                                                  type,
2385                                                  &nregs);
2386
2387   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2388      The following code thus handles passing by SIMD/FP registers first.  */
2389
2390   nvrn = pcum->aapcs_nvrn;
2391
2392   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2393      and homogenous short-vector aggregates (HVA).  */
2394   if (allocate_nvrn)
2395     {
2396       if (!TARGET_FLOAT)
2397         aarch64_err_no_fpadvsimd (mode, "argument");
2398
2399       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2400         {
2401           pcum->aapcs_nextnvrn = nvrn + nregs;
2402           if (!aarch64_composite_type_p (type, mode))
2403             {
2404               gcc_assert (nregs == 1);
2405               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2406             }
2407           else
2408             {
2409               rtx par;
2410               int i;
2411               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2412               for (i = 0; i < nregs; i++)
2413                 {
2414                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2415                                          V0_REGNUM + nvrn + i);
2416                   tmp = gen_rtx_EXPR_LIST
2417                     (VOIDmode, tmp,
2418                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2419                   XVECEXP (par, 0, i) = tmp;
2420                 }
2421               pcum->aapcs_reg = par;
2422             }
2423           return;
2424         }
2425       else
2426         {
2427           /* C.3 NSRN is set to 8.  */
2428           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2429           goto on_stack;
2430         }
2431     }
2432
2433   ncrn = pcum->aapcs_ncrn;
2434   nregs = size / UNITS_PER_WORD;
2435
2436   /* C6 - C9.  though the sign and zero extension semantics are
2437      handled elsewhere.  This is the case where the argument fits
2438      entirely general registers.  */
2439   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2440     {
2441
2442       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2443
2444       /* C.8 if the argument has an alignment of 16 then the NGRN is
2445          rounded up to the next even number.  */
2446       if (nregs == 2
2447           && ncrn % 2
2448           /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2449              comparison is there because for > 16 * BITS_PER_UNIT
2450              alignment nregs should be > 2 and therefore it should be
2451              passed by reference rather than value.  */
2452           && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2453         {
2454           ++ncrn;
2455           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2456         }
2457
2458       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2459          A reg is still generated for it, but the caller should be smart
2460          enough not to use it.  */
2461       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2462         pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2463       else
2464         {
2465           rtx par;
2466           int i;
2467
2468           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2469           for (i = 0; i < nregs; i++)
2470             {
2471               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2472               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2473                                        GEN_INT (i * UNITS_PER_WORD));
2474               XVECEXP (par, 0, i) = tmp;
2475             }
2476           pcum->aapcs_reg = par;
2477         }
2478
2479       pcum->aapcs_nextncrn = ncrn + nregs;
2480       return;
2481     }
2482
2483   /* C.11  */
2484   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2485
2486   /* The argument is passed on stack; record the needed number of words for
2487      this argument and align the total size if necessary.  */
2488 on_stack:
2489   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2490
2491   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2492     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2493                                        16 / UNITS_PER_WORD);
2494   return;
2495 }
2496
2497 /* Implement TARGET_FUNCTION_ARG.  */
2498
2499 static rtx
2500 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2501                       const_tree type, bool named)
2502 {
2503   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2504   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2505
2506   if (mode == VOIDmode)
2507     return NULL_RTX;
2508
2509   aarch64_layout_arg (pcum_v, mode, type, named);
2510   return pcum->aapcs_reg;
2511 }
2512
2513 void
2514 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2515                            const_tree fntype ATTRIBUTE_UNUSED,
2516                            rtx libname ATTRIBUTE_UNUSED,
2517                            const_tree fndecl ATTRIBUTE_UNUSED,
2518                            unsigned n_named ATTRIBUTE_UNUSED)
2519 {
2520   pcum->aapcs_ncrn = 0;
2521   pcum->aapcs_nvrn = 0;
2522   pcum->aapcs_nextncrn = 0;
2523   pcum->aapcs_nextnvrn = 0;
2524   pcum->pcs_variant = ARM_PCS_AAPCS64;
2525   pcum->aapcs_reg = NULL_RTX;
2526   pcum->aapcs_arg_processed = false;
2527   pcum->aapcs_stack_words = 0;
2528   pcum->aapcs_stack_size = 0;
2529
2530   if (!TARGET_FLOAT
2531       && fndecl && TREE_PUBLIC (fndecl)
2532       && fntype && fntype != error_mark_node)
2533     {
2534       const_tree type = TREE_TYPE (fntype);
2535       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2536       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2537       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2538                                                    &mode, &nregs, NULL))
2539         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2540     }
2541   return;
2542 }
2543
2544 static void
2545 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2546                               machine_mode mode,
2547                               const_tree type,
2548                               bool named)
2549 {
2550   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2551   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2552     {
2553       aarch64_layout_arg (pcum_v, mode, type, named);
2554       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2555                   != (pcum->aapcs_stack_words != 0));
2556       pcum->aapcs_arg_processed = false;
2557       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2558       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2559       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2560       pcum->aapcs_stack_words = 0;
2561       pcum->aapcs_reg = NULL_RTX;
2562     }
2563 }
2564
2565 bool
2566 aarch64_function_arg_regno_p (unsigned regno)
2567 {
2568   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2569           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2570 }
2571
2572 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2573    PARM_BOUNDARY bits of alignment, but will be given anything up
2574    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2575    that both before and after the layout of each argument, the Next
2576    Stacked Argument Address (NSAA) will have a minimum alignment of
2577    8 bytes.  */
2578
2579 static unsigned int
2580 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2581 {
2582   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2583   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2584 }
2585
2586 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2587
2588    Return true if an argument passed on the stack should be padded upwards,
2589    i.e. if the least-significant byte of the stack slot has useful data.
2590
2591    Small aggregate types are placed in the lowest memory address.
2592
2593    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2594
2595 bool
2596 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2597 {
2598   /* On little-endian targets, the least significant byte of every stack
2599      argument is passed at the lowest byte address of the stack slot.  */
2600   if (!BYTES_BIG_ENDIAN)
2601     return true;
2602
2603   /* Otherwise, integral, floating-point and pointer types are padded downward:
2604      the least significant byte of a stack argument is passed at the highest
2605      byte address of the stack slot.  */
2606   if (type
2607       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2608          || POINTER_TYPE_P (type))
2609       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2610     return false;
2611
2612   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2613   return true;
2614 }
2615
2616 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2617
2618    It specifies padding for the last (may also be the only)
2619    element of a block move between registers and memory.  If
2620    assuming the block is in the memory, padding upward means that
2621    the last element is padded after its highest significant byte,
2622    while in downward padding, the last element is padded at the
2623    its least significant byte side.
2624
2625    Small aggregates and small complex types are always padded
2626    upwards.
2627
2628    We don't need to worry about homogeneous floating-point or
2629    short-vector aggregates; their move is not affected by the
2630    padding direction determined here.  Regardless of endianness,
2631    each element of such an aggregate is put in the least
2632    significant bits of a fp/simd register.
2633
2634    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2635    register has useful data, and return the opposite if the most
2636    significant byte does.  */
2637
2638 bool
2639 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2640                      bool first ATTRIBUTE_UNUSED)
2641 {
2642
2643   /* Small composite types are always padded upward.  */
2644   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2645     {
2646       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2647                             : GET_MODE_SIZE (mode));
2648       if (size < 2 * UNITS_PER_WORD)
2649         return true;
2650     }
2651
2652   /* Otherwise, use the default padding.  */
2653   return !BYTES_BIG_ENDIAN;
2654 }
2655
2656 static machine_mode
2657 aarch64_libgcc_cmp_return_mode (void)
2658 {
2659   return SImode;
2660 }
2661
2662 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2663
2664 /* We use the 12-bit shifted immediate arithmetic instructions so values
2665    must be multiple of (1 << 12), i.e. 4096.  */
2666 #define ARITH_FACTOR 4096
2667
2668 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2669 #error Cannot use simple address calculation for stack probing
2670 #endif
2671
2672 /* The pair of scratch registers used for stack probing.  */
2673 #define PROBE_STACK_FIRST_REG  9
2674 #define PROBE_STACK_SECOND_REG 10
2675
2676 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2677    inclusive.  These are offsets from the current stack pointer.  */
2678
2679 static void
2680 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2681 {
2682   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2683
2684   /* See the same assertion on PROBE_INTERVAL above.  */
2685   gcc_assert ((first % ARITH_FACTOR) == 0);
2686
2687   /* See if we have a constant small number of probes to generate.  If so,
2688      that's the easy case.  */
2689   if (size <= PROBE_INTERVAL)
2690     {
2691       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2692
2693       emit_set_insn (reg1,
2694                      plus_constant (Pmode,
2695                                     stack_pointer_rtx, -(first + base)));
2696       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2697     }
2698
2699   /* The run-time loop is made up of 8 insns in the generic case while the
2700      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2701   else if (size <= 4 * PROBE_INTERVAL)
2702     {
2703       HOST_WIDE_INT i, rem;
2704
2705       emit_set_insn (reg1,
2706                      plus_constant (Pmode,
2707                                     stack_pointer_rtx,
2708                                     -(first + PROBE_INTERVAL)));
2709       emit_stack_probe (reg1);
2710
2711       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2712          it exceeds SIZE.  If only two probes are needed, this will not
2713          generate any code.  Then probe at FIRST + SIZE.  */
2714       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2715         {
2716           emit_set_insn (reg1,
2717                          plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2718           emit_stack_probe (reg1);
2719         }
2720
2721       rem = size - (i - PROBE_INTERVAL);
2722       if (rem > 256)
2723         {
2724           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2725
2726           emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2727           emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2728         }
2729       else
2730         emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2731     }
2732
2733   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2734      extra careful with variables wrapping around because we might be at
2735      the very top (or the very bottom) of the address space and we have
2736      to be able to handle this case properly; in particular, we use an
2737      equality test for the loop condition.  */
2738   else
2739     {
2740       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2741
2742       /* Step 1: round SIZE to the previous multiple of the interval.  */
2743
2744       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2745
2746
2747       /* Step 2: compute initial and final value of the loop counter.  */
2748
2749       /* TEST_ADDR = SP + FIRST.  */
2750       emit_set_insn (reg1,
2751                      plus_constant (Pmode, stack_pointer_rtx, -first));
2752
2753       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2754       HOST_WIDE_INT adjustment = - (first + rounded_size);
2755       if (! aarch64_uimm12_shift (adjustment))
2756         {
2757           aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
2758                                           true, Pmode);
2759           emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
2760         }
2761       else
2762         {
2763           emit_set_insn (reg2,
2764                          plus_constant (Pmode, stack_pointer_rtx, adjustment));
2765         }
2766
2767       /* Step 3: the loop
2768
2769          do
2770            {
2771              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2772              probe at TEST_ADDR
2773            }
2774          while (TEST_ADDR != LAST_ADDR)
2775
2776          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2777          until it is equal to ROUNDED_SIZE.  */
2778
2779       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2780
2781
2782       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2783          that SIZE is equal to ROUNDED_SIZE.  */
2784
2785       if (size != rounded_size)
2786         {
2787           HOST_WIDE_INT rem = size - rounded_size;
2788
2789           if (rem > 256)
2790             {
2791               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2792
2793               emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2794               emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2795             }
2796           else
2797             emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2798         }
2799     }
2800
2801   /* Make sure nothing is scheduled before we are done.  */
2802   emit_insn (gen_blockage ());
2803 }
2804
2805 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2806    absolute addresses.  */
2807
2808 const char *
2809 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2810 {
2811   static int labelno = 0;
2812   char loop_lab[32];
2813   rtx xops[2];
2814
2815   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2816
2817   /* Loop.  */
2818   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2819
2820   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2821   xops[0] = reg1;
2822   xops[1] = GEN_INT (PROBE_INTERVAL);
2823   output_asm_insn ("sub\t%0, %0, %1", xops);
2824
2825   /* Probe at TEST_ADDR.  */
2826   output_asm_insn ("str\txzr, [%0]", xops);
2827
2828   /* Test if TEST_ADDR == LAST_ADDR.  */
2829   xops[1] = reg2;
2830   output_asm_insn ("cmp\t%0, %1", xops);
2831
2832   /* Branch.  */
2833   fputs ("\tb.ne\t", asm_out_file);
2834   assemble_name_raw (asm_out_file, loop_lab);
2835   fputc ('\n', asm_out_file);
2836
2837   return "";
2838 }
2839
2840 static bool
2841 aarch64_frame_pointer_required (void)
2842 {
2843   /* In aarch64_override_options_after_change
2844      flag_omit_leaf_frame_pointer turns off the frame pointer by
2845      default.  Turn it back on now if we've not got a leaf
2846      function.  */
2847   if (flag_omit_leaf_frame_pointer
2848       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2849     return true;
2850
2851   /* Force a frame pointer for EH returns so the return address is at FP+8.  */
2852   if (crtl->calls_eh_return)
2853     return true;
2854
2855   return false;
2856 }
2857
2858 /* Mark the registers that need to be saved by the callee and calculate
2859    the size of the callee-saved registers area and frame record (both FP
2860    and LR may be omitted).  */
2861 static void
2862 aarch64_layout_frame (void)
2863 {
2864   HOST_WIDE_INT offset = 0;
2865   int regno, last_fp_reg = INVALID_REGNUM;
2866
2867   if (reload_completed && cfun->machine->frame.laid_out)
2868     return;
2869
2870 #define SLOT_NOT_REQUIRED (-2)
2871 #define SLOT_REQUIRED     (-1)
2872
2873   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2874   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2875
2876   /* First mark all the registers that really need to be saved...  */
2877   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2878     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2879
2880   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2881     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2882
2883   /* ... that includes the eh data registers (if needed)...  */
2884   if (crtl->calls_eh_return)
2885     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2886       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2887         = SLOT_REQUIRED;
2888
2889   /* ... and any callee saved register that dataflow says is live.  */
2890   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2891     if (df_regs_ever_live_p (regno)
2892         && (regno == R30_REGNUM
2893             || !call_used_regs[regno]))
2894       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2895
2896   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2897     if (df_regs_ever_live_p (regno)
2898         && !call_used_regs[regno])
2899       {
2900         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2901         last_fp_reg = regno;
2902       }
2903
2904   if (frame_pointer_needed)
2905     {
2906       /* FP and LR are placed in the linkage record.  */
2907       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2908       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2909       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2910       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2911       offset += 2 * UNITS_PER_WORD;
2912     }
2913
2914   /* Now assign stack slots for them.  */
2915   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2916     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2917       {
2918         cfun->machine->frame.reg_offset[regno] = offset;
2919         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2920           cfun->machine->frame.wb_candidate1 = regno;
2921         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2922           cfun->machine->frame.wb_candidate2 = regno;
2923         offset += UNITS_PER_WORD;
2924       }
2925
2926   HOST_WIDE_INT max_int_offset = offset;
2927   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2928   bool has_align_gap = offset != max_int_offset;
2929
2930   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2931     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2932       {
2933         /* If there is an alignment gap between integer and fp callee-saves,
2934            allocate the last fp register to it if possible.  */
2935         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2936           {
2937             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2938             break;
2939           }
2940
2941         cfun->machine->frame.reg_offset[regno] = offset;
2942         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2943           cfun->machine->frame.wb_candidate1 = regno;
2944         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2945                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2946           cfun->machine->frame.wb_candidate2 = regno;
2947         offset += UNITS_PER_WORD;
2948       }
2949
2950   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2951
2952   cfun->machine->frame.saved_regs_size = offset;
2953
2954   HOST_WIDE_INT varargs_and_saved_regs_size
2955     = offset + cfun->machine->frame.saved_varargs_size;
2956
2957   cfun->machine->frame.hard_fp_offset
2958     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2959                 STACK_BOUNDARY / BITS_PER_UNIT);
2960
2961   cfun->machine->frame.frame_size
2962     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2963                 + crtl->outgoing_args_size,
2964                 STACK_BOUNDARY / BITS_PER_UNIT);
2965
2966   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2967
2968   cfun->machine->frame.initial_adjust = 0;
2969   cfun->machine->frame.final_adjust = 0;
2970   cfun->machine->frame.callee_adjust = 0;
2971   cfun->machine->frame.callee_offset = 0;
2972
2973   HOST_WIDE_INT max_push_offset = 0;
2974   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2975     max_push_offset = 512;
2976   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2977     max_push_offset = 256;
2978
2979   if (cfun->machine->frame.frame_size < max_push_offset
2980       && crtl->outgoing_args_size == 0)
2981     {
2982       /* Simple, small frame with no outgoing arguments:
2983          stp reg1, reg2, [sp, -frame_size]!
2984          stp reg3, reg4, [sp, 16]  */
2985       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2986     }
2987   else if ((crtl->outgoing_args_size
2988             + cfun->machine->frame.saved_regs_size < 512)
2989            && !(cfun->calls_alloca
2990                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2991     {
2992       /* Frame with small outgoing arguments:
2993          sub sp, sp, frame_size
2994          stp reg1, reg2, [sp, outgoing_args_size]
2995          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
2996       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2997       cfun->machine->frame.callee_offset
2998         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2999     }
3000   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
3001     {
3002       /* Frame with large outgoing arguments but a small local area:
3003          stp reg1, reg2, [sp, -hard_fp_offset]!
3004          stp reg3, reg4, [sp, 16]
3005          sub sp, sp, outgoing_args_size  */
3006       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
3007       cfun->machine->frame.final_adjust
3008         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3009     }
3010   else if (!frame_pointer_needed
3011            && varargs_and_saved_regs_size < max_push_offset)
3012     {
3013       /* Frame with large local area and outgoing arguments (this pushes the
3014          callee-saves first, followed by the locals and outgoing area):
3015          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3016          stp reg3, reg4, [sp, 16]
3017          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
3018       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
3019       cfun->machine->frame.final_adjust
3020         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3021       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
3022       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
3023     }
3024   else
3025     {
3026       /* Frame with large local area and outgoing arguments using frame pointer:
3027          sub sp, sp, hard_fp_offset
3028          stp x29, x30, [sp, 0]
3029          add x29, sp, 0
3030          stp reg3, reg4, [sp, 16]
3031          sub sp, sp, outgoing_args_size  */
3032       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3033       cfun->machine->frame.final_adjust
3034         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3035     }
3036
3037   cfun->machine->frame.laid_out = true;
3038 }
3039
3040 /* Return true if the register REGNO is saved on entry to
3041    the current function.  */
3042
3043 static bool
3044 aarch64_register_saved_on_entry (int regno)
3045 {
3046   return cfun->machine->frame.reg_offset[regno] >= 0;
3047 }
3048
3049 /* Return the next register up from REGNO up to LIMIT for the callee
3050    to save.  */
3051
3052 static unsigned
3053 aarch64_next_callee_save (unsigned regno, unsigned limit)
3054 {
3055   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3056     regno ++;
3057   return regno;
3058 }
3059
3060 /* Push the register number REGNO of mode MODE to the stack with write-back
3061    adjusting the stack by ADJUSTMENT.  */
3062
3063 static void
3064 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3065                            HOST_WIDE_INT adjustment)
3066  {
3067   rtx base_rtx = stack_pointer_rtx;
3068   rtx insn, reg, mem;
3069
3070   reg = gen_rtx_REG (mode, regno);
3071   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3072                             plus_constant (Pmode, base_rtx, -adjustment));
3073   mem = gen_frame_mem (mode, mem);
3074
3075   insn = emit_move_insn (mem, reg);
3076   RTX_FRAME_RELATED_P (insn) = 1;
3077 }
3078
3079 /* Generate and return an instruction to store the pair of registers
3080    REG and REG2 of mode MODE to location BASE with write-back adjusting
3081    the stack location BASE by ADJUSTMENT.  */
3082
3083 static rtx
3084 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3085                           HOST_WIDE_INT adjustment)
3086 {
3087   switch (mode)
3088     {
3089     case E_DImode:
3090       return gen_storewb_pairdi_di (base, base, reg, reg2,
3091                                     GEN_INT (-adjustment),
3092                                     GEN_INT (UNITS_PER_WORD - adjustment));
3093     case E_DFmode:
3094       return gen_storewb_pairdf_di (base, base, reg, reg2,
3095                                     GEN_INT (-adjustment),
3096                                     GEN_INT (UNITS_PER_WORD - adjustment));
3097     default:
3098       gcc_unreachable ();
3099     }
3100 }
3101
3102 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3103    stack pointer by ADJUSTMENT.  */
3104
3105 static void
3106 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3107 {
3108   rtx_insn *insn;
3109   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3110
3111   if (regno2 == INVALID_REGNUM)
3112     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3113
3114   rtx reg1 = gen_rtx_REG (mode, regno1);
3115   rtx reg2 = gen_rtx_REG (mode, regno2);
3116
3117   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3118                                               reg2, adjustment));
3119   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3120   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3121   RTX_FRAME_RELATED_P (insn) = 1;
3122 }
3123
3124 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3125    adjusting it by ADJUSTMENT afterwards.  */
3126
3127 static rtx
3128 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3129                          HOST_WIDE_INT adjustment)
3130 {
3131   switch (mode)
3132     {
3133     case E_DImode:
3134       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3135                                    GEN_INT (UNITS_PER_WORD));
3136     case E_DFmode:
3137       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3138                                    GEN_INT (UNITS_PER_WORD));
3139     default:
3140       gcc_unreachable ();
3141     }
3142 }
3143
3144 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3145    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3146    into CFI_OPS.  */
3147
3148 static void
3149 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3150                   rtx *cfi_ops)
3151 {
3152   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
3153   rtx reg1 = gen_rtx_REG (mode, regno1);
3154
3155   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3156
3157   if (regno2 == INVALID_REGNUM)
3158     {
3159       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3160       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3161       emit_move_insn (reg1, gen_frame_mem (mode, mem));
3162     }
3163   else
3164     {
3165       rtx reg2 = gen_rtx_REG (mode, regno2);
3166       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3167       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3168                                           reg2, adjustment));
3169     }
3170 }
3171
3172 /* Generate and return a store pair instruction of mode MODE to store
3173    register REG1 to MEM1 and register REG2 to MEM2.  */
3174
3175 static rtx
3176 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3177                         rtx reg2)
3178 {
3179   switch (mode)
3180     {
3181     case E_DImode:
3182       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3183
3184     case E_DFmode:
3185       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3186
3187     default:
3188       gcc_unreachable ();
3189     }
3190 }
3191
3192 /* Generate and regurn a load pair isntruction of mode MODE to load register
3193    REG1 from MEM1 and register REG2 from MEM2.  */
3194
3195 static rtx
3196 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3197                        rtx mem2)
3198 {
3199   switch (mode)
3200     {
3201     case E_DImode:
3202       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3203
3204     case E_DFmode:
3205       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3206
3207     default:
3208       gcc_unreachable ();
3209     }
3210 }
3211
3212 /* Return TRUE if return address signing should be enabled for the current
3213    function, otherwise return FALSE.  */
3214
3215 bool
3216 aarch64_return_address_signing_enabled (void)
3217 {
3218   /* This function should only be called after frame laid out.   */
3219   gcc_assert (cfun->machine->frame.laid_out);
3220
3221   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3222      if it's LR is pushed onto stack.  */
3223   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3224           || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3225               && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3226 }
3227
3228 /* Emit code to save the callee-saved registers from register number START
3229    to LIMIT to the stack at the location starting at offset START_OFFSET,
3230    skipping any write-back candidates if SKIP_WB is true.  */
3231
3232 static void
3233 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3234                            unsigned start, unsigned limit, bool skip_wb)
3235 {
3236   rtx_insn *insn;
3237   unsigned regno;
3238   unsigned regno2;
3239
3240   for (regno = aarch64_next_callee_save (start, limit);
3241        regno <= limit;
3242        regno = aarch64_next_callee_save (regno + 1, limit))
3243     {
3244       rtx reg, mem;
3245       HOST_WIDE_INT offset;
3246
3247       if (skip_wb
3248           && (regno == cfun->machine->frame.wb_candidate1
3249               || regno == cfun->machine->frame.wb_candidate2))
3250         continue;
3251
3252       if (cfun->machine->reg_is_wrapped_separately[regno])
3253        continue;
3254
3255       reg = gen_rtx_REG (mode, regno);
3256       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3257       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3258                                                 offset));
3259
3260       regno2 = aarch64_next_callee_save (regno + 1, limit);
3261
3262       if (regno2 <= limit
3263           && !cfun->machine->reg_is_wrapped_separately[regno2]
3264           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3265               == cfun->machine->frame.reg_offset[regno2]))
3266
3267         {
3268           rtx reg2 = gen_rtx_REG (mode, regno2);
3269           rtx mem2;
3270
3271           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3272           mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
3273                                                      offset));
3274           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3275                                                     reg2));
3276
3277           /* The first part of a frame-related parallel insn is
3278              always assumed to be relevant to the frame
3279              calculations; subsequent parts, are only
3280              frame-related if explicitly marked.  */
3281           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3282           regno = regno2;
3283         }
3284       else
3285         insn = emit_move_insn (mem, reg);
3286
3287       RTX_FRAME_RELATED_P (insn) = 1;
3288     }
3289 }
3290
3291 /* Emit code to restore the callee registers of mode MODE from register
3292    number START up to and including LIMIT.  Restore from the stack offset
3293    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3294    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3295
3296 static void
3297 aarch64_restore_callee_saves (machine_mode mode,
3298                               HOST_WIDE_INT start_offset, unsigned start,
3299                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3300 {
3301   rtx base_rtx = stack_pointer_rtx;
3302   unsigned regno;
3303   unsigned regno2;
3304   HOST_WIDE_INT offset;
3305
3306   for (regno = aarch64_next_callee_save (start, limit);
3307        regno <= limit;
3308        regno = aarch64_next_callee_save (regno + 1, limit))
3309     {
3310       if (cfun->machine->reg_is_wrapped_separately[regno])
3311        continue;
3312
3313       rtx reg, mem;
3314
3315       if (skip_wb
3316           && (regno == cfun->machine->frame.wb_candidate1
3317               || regno == cfun->machine->frame.wb_candidate2))
3318         continue;
3319
3320       reg = gen_rtx_REG (mode, regno);
3321       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3322       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3323
3324       regno2 = aarch64_next_callee_save (regno + 1, limit);
3325
3326       if (regno2 <= limit
3327           && !cfun->machine->reg_is_wrapped_separately[regno2]
3328           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3329               == cfun->machine->frame.reg_offset[regno2]))
3330         {
3331           rtx reg2 = gen_rtx_REG (mode, regno2);
3332           rtx mem2;
3333
3334           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3335           mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
3336           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3337
3338           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3339           regno = regno2;
3340         }
3341       else
3342         emit_move_insn (reg, mem);
3343       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3344     }
3345 }
3346
3347 static inline bool
3348 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3349                                HOST_WIDE_INT offset)
3350 {
3351   return offset >= -256 && offset < 256;
3352 }
3353
3354 static inline bool
3355 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 {
3357   return (offset >= 0
3358           && offset < 4096 * GET_MODE_SIZE (mode)
3359           && offset % GET_MODE_SIZE (mode) == 0);
3360 }
3361
3362 bool
3363 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3364 {
3365   return (offset >= -64 * GET_MODE_SIZE (mode)
3366           && offset < 64 * GET_MODE_SIZE (mode)
3367           && offset % GET_MODE_SIZE (mode) == 0);
3368 }
3369
3370 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
3371
3372 static sbitmap
3373 aarch64_get_separate_components (void)
3374 {
3375   aarch64_layout_frame ();
3376
3377   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3378   bitmap_clear (components);
3379
3380   /* The registers we need saved to the frame.  */
3381   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3382     if (aarch64_register_saved_on_entry (regno))
3383       {
3384         HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3385         if (!frame_pointer_needed)
3386           offset += cfun->machine->frame.frame_size
3387                     - cfun->machine->frame.hard_fp_offset;
3388         /* Check that we can access the stack slot of the register with one
3389            direct load with no adjustments needed.  */
3390         if (offset_12bit_unsigned_scaled_p (DImode, offset))
3391           bitmap_set_bit (components, regno);
3392       }
3393
3394   /* Don't mess with the hard frame pointer.  */
3395   if (frame_pointer_needed)
3396     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3397
3398   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3399   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3400   /* If aarch64_layout_frame has chosen registers to store/restore with
3401      writeback don't interfere with them to avoid having to output explicit
3402      stack adjustment instructions.  */
3403   if (reg2 != INVALID_REGNUM)
3404     bitmap_clear_bit (components, reg2);
3405   if (reg1 != INVALID_REGNUM)
3406     bitmap_clear_bit (components, reg1);
3407
3408   bitmap_clear_bit (components, LR_REGNUM);
3409   bitmap_clear_bit (components, SP_REGNUM);
3410
3411   return components;
3412 }
3413
3414 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
3415
3416 static sbitmap
3417 aarch64_components_for_bb (basic_block bb)
3418 {
3419   bitmap in = DF_LIVE_IN (bb);
3420   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3421   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3422
3423   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3424   bitmap_clear (components);
3425
3426   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
3427   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3428     if ((!call_used_regs[regno])
3429        && (bitmap_bit_p (in, regno)
3430            || bitmap_bit_p (gen, regno)
3431            || bitmap_bit_p (kill, regno)))
3432           bitmap_set_bit (components, regno);
3433
3434   return components;
3435 }
3436
3437 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3438    Nothing to do for aarch64.  */
3439
3440 static void
3441 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3442 {
3443 }
3444
3445 /* Return the next set bit in BMP from START onwards.  Return the total number
3446    of bits in BMP if no set bit is found at or after START.  */
3447
3448 static unsigned int
3449 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3450 {
3451   unsigned int nbits = SBITMAP_SIZE (bmp);
3452   if (start == nbits)
3453     return start;
3454
3455   gcc_assert (start < nbits);
3456   for (unsigned int i = start; i < nbits; i++)
3457     if (bitmap_bit_p (bmp, i))
3458       return i;
3459
3460   return nbits;
3461 }
3462
3463 /* Do the work for aarch64_emit_prologue_components and
3464    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
3465    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3466    for these components or the epilogue sequence.  That is, it determines
3467    whether we should emit stores or loads and what kind of CFA notes to attach
3468    to the insns.  Otherwise the logic for the two sequences is very
3469    similar.  */
3470
3471 static void
3472 aarch64_process_components (sbitmap components, bool prologue_p)
3473 {
3474   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3475                              ? HARD_FRAME_POINTER_REGNUM
3476                              : STACK_POINTER_REGNUM);
3477
3478   unsigned last_regno = SBITMAP_SIZE (components);
3479   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3480   rtx_insn *insn = NULL;
3481
3482   while (regno != last_regno)
3483     {
3484       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3485          so DFmode for the vector registers is enough.  */
3486       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
3487       rtx reg = gen_rtx_REG (mode, regno);
3488       HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3489       if (!frame_pointer_needed)
3490         offset += cfun->machine->frame.frame_size
3491                   - cfun->machine->frame.hard_fp_offset;
3492       rtx addr = plus_constant (Pmode, ptr_reg, offset);
3493       rtx mem = gen_frame_mem (mode, addr);
3494
3495       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3496       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3497       /* No more registers to handle after REGNO.
3498          Emit a single save/restore and exit.  */
3499       if (regno2 == last_regno)
3500         {
3501           insn = emit_insn (set);
3502           RTX_FRAME_RELATED_P (insn) = 1;
3503           if (prologue_p)
3504             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3505           else
3506             add_reg_note (insn, REG_CFA_RESTORE, reg);
3507           break;
3508         }
3509
3510       HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3511       /* The next register is not of the same class or its offset is not
3512          mergeable with the current one into a pair.  */
3513       if (!satisfies_constraint_Ump (mem)
3514           || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3515           || (offset2 - cfun->machine->frame.reg_offset[regno])
3516                 != GET_MODE_SIZE (mode))
3517         {
3518           insn = emit_insn (set);
3519           RTX_FRAME_RELATED_P (insn) = 1;
3520           if (prologue_p)
3521             add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3522           else
3523             add_reg_note (insn, REG_CFA_RESTORE, reg);
3524
3525           regno = regno2;
3526           continue;
3527         }
3528
3529       /* REGNO2 can be saved/restored in a pair with REGNO.  */
3530       rtx reg2 = gen_rtx_REG (mode, regno2);
3531       if (!frame_pointer_needed)
3532         offset2 += cfun->machine->frame.frame_size
3533                   - cfun->machine->frame.hard_fp_offset;
3534       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3535       rtx mem2 = gen_frame_mem (mode, addr2);
3536       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3537                              : gen_rtx_SET (reg2, mem2);
3538
3539       if (prologue_p)
3540         insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3541       else
3542         insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3543
3544       RTX_FRAME_RELATED_P (insn) = 1;
3545       if (prologue_p)
3546         {
3547           add_reg_note (insn, REG_CFA_OFFSET, set);
3548           add_reg_note (insn, REG_CFA_OFFSET, set2);
3549         }
3550       else
3551         {
3552           add_reg_note (insn, REG_CFA_RESTORE, reg);
3553           add_reg_note (insn, REG_CFA_RESTORE, reg2);
3554         }
3555
3556       regno = aarch64_get_next_set_bit (components, regno2 + 1);
3557     }
3558 }
3559
3560 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
3561
3562 static void
3563 aarch64_emit_prologue_components (sbitmap components)
3564 {
3565   aarch64_process_components (components, true);
3566 }
3567
3568 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
3569
3570 static void
3571 aarch64_emit_epilogue_components (sbitmap components)
3572 {
3573   aarch64_process_components (components, false);
3574 }
3575
3576 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
3577
3578 static void
3579 aarch64_set_handled_components (sbitmap components)
3580 {
3581   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3582     if (bitmap_bit_p (components, regno))
3583       cfun->machine->reg_is_wrapped_separately[regno] = true;
3584 }
3585
3586 /* AArch64 stack frames generated by this compiler look like:
3587
3588         +-------------------------------+
3589         |                               |
3590         |  incoming stack arguments     |
3591         |                               |
3592         +-------------------------------+
3593         |                               | <-- incoming stack pointer (aligned)
3594         |  callee-allocated save area   |
3595         |  for register varargs         |
3596         |                               |
3597         +-------------------------------+
3598         |  local variables              | <-- frame_pointer_rtx
3599         |                               |
3600         +-------------------------------+
3601         |  padding0                     | \
3602         +-------------------------------+  |
3603         |  callee-saved registers       |  | frame.saved_regs_size
3604         +-------------------------------+  |
3605         |  LR'                          |  |
3606         +-------------------------------+  |
3607         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3608         +-------------------------------+
3609         |  dynamic allocation           |
3610         +-------------------------------+
3611         |  padding                      |
3612         +-------------------------------+
3613         |  outgoing stack arguments     | <-- arg_pointer
3614         |                               |
3615         +-------------------------------+
3616         |                               | <-- stack_pointer_rtx (aligned)
3617
3618    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3619    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3620    unchanged.  */
3621
3622 /* Generate the prologue instructions for entry into a function.
3623    Establish the stack frame by decreasing the stack pointer with a
3624    properly calculated size and, if necessary, create a frame record
3625    filled with the values of LR and previous frame pointer.  The
3626    current FP is also set up if it is in use.  */
3627
3628 void
3629 aarch64_expand_prologue (void)
3630 {
3631   aarch64_layout_frame ();
3632
3633   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3634   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3635   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3636   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3637   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3638   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3639   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3640   rtx_insn *insn;
3641
3642   /* Sign return address for functions.  */
3643   if (aarch64_return_address_signing_enabled ())
3644     {
3645       insn = emit_insn (gen_pacisp ());
3646       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3647       RTX_FRAME_RELATED_P (insn) = 1;
3648     }
3649
3650   if (flag_stack_usage_info)
3651     current_function_static_stack_size = frame_size;
3652
3653   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3654     {
3655       if (crtl->is_leaf && !cfun->calls_alloca)
3656         {
3657           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3658             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3659                                             frame_size - STACK_CHECK_PROTECT);
3660         }
3661       else if (frame_size > 0)
3662         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3663     }
3664
3665   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3666
3667   if (callee_adjust != 0)
3668     aarch64_push_regs (reg1, reg2, callee_adjust);
3669
3670   if (frame_pointer_needed)
3671     {
3672       if (callee_adjust == 0)
3673         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3674                                    R30_REGNUM, false);
3675       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3676                                        stack_pointer_rtx,
3677                                        GEN_INT (callee_offset)));
3678       RTX_FRAME_RELATED_P (insn) = 1;
3679       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3680     }
3681
3682   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3683                              callee_adjust != 0 || frame_pointer_needed);
3684   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3685                              callee_adjust != 0 || frame_pointer_needed);
3686   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3687 }
3688
3689 /* Return TRUE if we can use a simple_return insn.
3690
3691    This function checks whether the callee saved stack is empty, which
3692    means no restore actions are need. The pro_and_epilogue will use
3693    this to check whether shrink-wrapping opt is feasible.  */
3694
3695 bool
3696 aarch64_use_return_insn_p (void)
3697 {
3698   if (!reload_completed)
3699     return false;
3700
3701   if (crtl->profile)
3702     return false;
3703
3704   aarch64_layout_frame ();
3705
3706   return cfun->machine->frame.frame_size == 0;
3707 }
3708
3709 /* Generate the epilogue instructions for returning from a function.
3710    This is almost exactly the reverse of the prolog sequence, except
3711    that we need to insert barriers to avoid scheduling loads that read
3712    from a deallocated stack, and we optimize the unwind records by
3713    emitting them all together if possible.  */
3714 void
3715 aarch64_expand_epilogue (bool for_sibcall)
3716 {
3717   aarch64_layout_frame ();
3718
3719   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3720   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3721   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3722   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3723   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3724   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3725   rtx cfi_ops = NULL;
3726   rtx_insn *insn;
3727
3728   /* We need to add memory barrier to prevent read from deallocated stack.  */
3729   bool need_barrier_p = (get_frame_size ()
3730                          + cfun->machine->frame.saved_varargs_size) != 0;
3731
3732   /* Emit a barrier to prevent loads from a deallocated stack.  */
3733   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3734       || crtl->calls_eh_return)
3735     {
3736       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3737       need_barrier_p = false;
3738     }
3739
3740   /* Restore the stack pointer from the frame pointer if it may not
3741      be the same as the stack pointer.  */
3742   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3743     {
3744       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3745                                        hard_frame_pointer_rtx,
3746                                        GEN_INT (-callee_offset)));
3747       /* If writeback is used when restoring callee-saves, the CFA
3748          is restored on the instruction doing the writeback.  */
3749       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3750     }
3751   else
3752     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3753
3754   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3755                                 callee_adjust != 0, &cfi_ops);
3756   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3757                                 callee_adjust != 0, &cfi_ops);
3758
3759   if (need_barrier_p)
3760     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3761
3762   if (callee_adjust != 0)
3763     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3764
3765   if (callee_adjust != 0 || initial_adjust > 65536)
3766     {
3767       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3768       insn = get_last_insn ();
3769       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3770       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3771       RTX_FRAME_RELATED_P (insn) = 1;
3772       cfi_ops = NULL;
3773     }
3774
3775   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3776
3777   if (cfi_ops)
3778     {
3779       /* Emit delayed restores and reset the CFA to be SP.  */
3780       insn = get_last_insn ();
3781       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3782       REG_NOTES (insn) = cfi_ops;
3783       RTX_FRAME_RELATED_P (insn) = 1;
3784     }
3785
3786   /* We prefer to emit the combined return/authenticate instruction RETAA,
3787      however there are three cases in which we must instead emit an explicit
3788      authentication instruction.
3789
3790         1) Sibcalls don't return in a normal way, so if we're about to call one
3791            we must authenticate.
3792
3793         2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3794            generating code for !TARGET_ARMV8_3 we can't use it and must
3795            explicitly authenticate.
3796
3797         3) On an eh_return path we make extra stack adjustments to update the
3798            canonical frame address to be the exception handler's CFA.  We want
3799            to authenticate using the CFA of the function which calls eh_return.
3800     */
3801   if (aarch64_return_address_signing_enabled ()
3802       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3803     {
3804       insn = emit_insn (gen_autisp ());
3805       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3806       RTX_FRAME_RELATED_P (insn) = 1;
3807     }
3808
3809   /* Stack adjustment for exception handler.  */
3810   if (crtl->calls_eh_return)
3811     {
3812       /* We need to unwind the stack by the offset computed by
3813          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3814          to be SP; letting the CFA move during this adjustment
3815          is just as correct as retaining the CFA from the body
3816          of the function.  Therefore, do nothing special.  */
3817       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3818     }
3819
3820   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3821   if (!for_sibcall)
3822     emit_jump_insn (ret_rtx);
3823 }
3824
3825 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
3826    normally or return to a previous frame after unwinding.
3827
3828    An EH return uses a single shared return sequence.  The epilogue is
3829    exactly like a normal epilogue except that it has an extra input
3830    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3831    that must be applied after the frame has been destroyed.  An extra label
3832    is inserted before the epilogue which initializes this register to zero,
3833    and this is the entry point for a normal return.
3834
3835    An actual EH return updates the return address, initializes the stack
3836    adjustment and jumps directly into the epilogue (bypassing the zeroing
3837    of the adjustment).  Since the return address is typically saved on the
3838    stack when a function makes a call, the saved LR must be updated outside
3839    the epilogue.
3840
3841    This poses problems as the store is generated well before the epilogue,
3842    so the offset of LR is not known yet.  Also optimizations will remove the
3843    store as it appears dead, even after the epilogue is generated (as the
3844    base or offset for loading LR is different in many cases).
3845
3846    To avoid these problems this implementation forces the frame pointer
3847    in eh_return functions so that the location of LR is fixed and known early.
3848    It also marks the store volatile, so no optimization is permitted to
3849    remove the store.  */
3850 rtx
3851 aarch64_eh_return_handler_rtx (void)
3852 {
3853   rtx tmp = gen_frame_mem (Pmode,
3854     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3855
3856   /* Mark the store volatile, so no optimization is permitted to remove it.  */
3857   MEM_VOLATILE_P (tmp) = true;
3858   return tmp;
3859 }
3860
3861 /* Output code to add DELTA to the first argument, and then jump
3862    to FUNCTION.  Used for C++ multiple inheritance.  */
3863 static void
3864 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3865                          HOST_WIDE_INT delta,
3866                          HOST_WIDE_INT vcall_offset,
3867                          tree function)
3868 {
3869   /* The this pointer is always in x0.  Note that this differs from
3870      Arm where the this pointer maybe bumped to r1 if r0 is required
3871      to return a pointer to an aggregate.  On AArch64 a result value
3872      pointer will be in x8.  */
3873   int this_regno = R0_REGNUM;
3874   rtx this_rtx, temp0, temp1, addr, funexp;
3875   rtx_insn *insn;
3876
3877   reload_completed = 1;
3878   emit_note (NOTE_INSN_PROLOGUE_END);
3879
3880   if (vcall_offset == 0)
3881     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3882   else
3883     {
3884       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3885
3886       this_rtx = gen_rtx_REG (Pmode, this_regno);
3887       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3888       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3889
3890       addr = this_rtx;
3891       if (delta != 0)
3892         {
3893           if (delta >= -256 && delta < 256)
3894             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3895                                        plus_constant (Pmode, this_rtx, delta));
3896           else
3897             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3898         }
3899
3900       if (Pmode == ptr_mode)
3901         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3902       else
3903         aarch64_emit_move (temp0,
3904                            gen_rtx_ZERO_EXTEND (Pmode,
3905                                                 gen_rtx_MEM (ptr_mode, addr)));
3906
3907       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3908           addr = plus_constant (Pmode, temp0, vcall_offset);
3909       else
3910         {
3911           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3912                                           Pmode);
3913           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3914         }
3915
3916       if (Pmode == ptr_mode)
3917         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3918       else
3919         aarch64_emit_move (temp1,
3920                            gen_rtx_SIGN_EXTEND (Pmode,
3921                                                 gen_rtx_MEM (ptr_mode, addr)));
3922
3923       emit_insn (gen_add2_insn (this_rtx, temp1));
3924     }
3925
3926   /* Generate a tail call to the target function.  */
3927   if (!TREE_USED (function))
3928     {
3929       assemble_external (function);
3930       TREE_USED (function) = 1;
3931     }
3932   funexp = XEXP (DECL_RTL (function), 0);
3933   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3934   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3935   SIBLING_CALL_P (insn) = 1;
3936
3937   insn = get_insns ();
3938   shorten_branches (insn);
3939   final_start_function (insn, file, 1);
3940   final (insn, file, 1);
3941   final_end_function ();
3942
3943   /* Stop pretending to be a post-reload pass.  */
3944   reload_completed = 0;
3945 }
3946
3947 static bool
3948 aarch64_tls_referenced_p (rtx x)
3949 {
3950   if (!TARGET_HAVE_TLS)
3951     return false;
3952   subrtx_iterator::array_type array;
3953   FOR_EACH_SUBRTX (iter, array, x, ALL)
3954     {
3955       const_rtx x = *iter;
3956       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3957         return true;
3958       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3959          TLS offsets, not real symbol references.  */
3960       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3961         iter.skip_subrtxes ();
3962     }
3963   return false;
3964 }
3965
3966
3967 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3968    a left shift of 0 or 12 bits.  */
3969 bool
3970 aarch64_uimm12_shift (HOST_WIDE_INT val)
3971 {
3972   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3973           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3974           );
3975 }
3976
3977
3978 /* Return true if val is an immediate that can be loaded into a
3979    register by a MOVZ instruction.  */
3980 static bool
3981 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3982 {
3983   if (GET_MODE_SIZE (mode) > 4)
3984     {
3985       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3986           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3987         return 1;
3988     }
3989   else
3990     {
3991       /* Ignore sign extension.  */
3992       val &= (HOST_WIDE_INT) 0xffffffff;
3993     }
3994   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3995           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3996 }
3997
3998 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
3999
4000 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4001   {
4002     0x0000000100000001ull,
4003     0x0001000100010001ull,
4004     0x0101010101010101ull,
4005     0x1111111111111111ull,
4006     0x5555555555555555ull,
4007   };
4008
4009
4010 /* Return true if val is a valid bitmask immediate.  */
4011
4012 bool
4013 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
4014 {
4015   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
4016   int bits;
4017
4018   /* Check for a single sequence of one bits and return quickly if so.
4019      The special cases of all ones and all zeroes returns false.  */
4020   val = (unsigned HOST_WIDE_INT) val_in;
4021   tmp = val + (val & -val);
4022
4023   if (tmp == (tmp & -tmp))
4024     return (val + 1) > 1;
4025
4026   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
4027   if (mode == SImode)
4028     val = (val << 32) | (val & 0xffffffff);
4029
4030   /* Invert if the immediate doesn't start with a zero bit - this means we
4031      only need to search for sequences of one bits.  */
4032   if (val & 1)
4033     val = ~val;
4034
4035   /* Find the first set bit and set tmp to val with the first sequence of one
4036      bits removed.  Return success if there is a single sequence of ones.  */
4037   first_one = val & -val;
4038   tmp = val & (val + first_one);
4039
4040   if (tmp == 0)
4041     return true;
4042
4043   /* Find the next set bit and compute the difference in bit position.  */
4044   next_one = tmp & -tmp;
4045   bits = clz_hwi (first_one) - clz_hwi (next_one);
4046   mask = val ^ tmp;
4047
4048   /* Check the bit position difference is a power of 2, and that the first
4049      sequence of one bits fits within 'bits' bits.  */
4050   if ((mask >> bits) != 0 || bits != (bits & -bits))
4051     return false;
4052
4053   /* Check the sequence of one bits is repeated 64/bits times.  */
4054   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
4055 }
4056
4057 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4058    Assumed precondition: VAL_IN Is not zero.  */
4059
4060 unsigned HOST_WIDE_INT
4061 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4062 {
4063   int lowest_bit_set = ctz_hwi (val_in);
4064   int highest_bit_set = floor_log2 (val_in);
4065   gcc_assert (val_in != 0);
4066
4067   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4068           (HOST_WIDE_INT_1U << lowest_bit_set));
4069 }
4070
4071 /* Create constant where bits outside of lowest bit set to highest bit set
4072    are set to 1.  */
4073
4074 unsigned HOST_WIDE_INT
4075 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4076 {
4077   return val_in | ~aarch64_and_split_imm1 (val_in);
4078 }
4079
4080 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4081
4082 bool
4083 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4084 {
4085   if (aarch64_bitmask_imm (val_in, mode))
4086     return false;
4087
4088   if (aarch64_move_imm (val_in, mode))
4089     return false;
4090
4091   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4092
4093   return aarch64_bitmask_imm (imm2, mode);
4094 }
4095
4096 /* Return true if val is an immediate that can be loaded into a
4097    register in a single instruction.  */
4098 bool
4099 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4100 {
4101   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4102     return 1;
4103   return aarch64_bitmask_imm (val, mode);
4104 }
4105
4106 static bool
4107 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4108 {
4109   rtx base, offset;
4110
4111   if (GET_CODE (x) == HIGH)
4112     return true;
4113
4114   split_const (x, &base, &offset);
4115   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4116     {
4117       if (aarch64_classify_symbol (base, offset)
4118           != SYMBOL_FORCE_TO_MEM)
4119         return true;
4120       else
4121         /* Avoid generating a 64-bit relocation in ILP32; leave
4122            to aarch64_expand_mov_immediate to handle it properly.  */
4123         return mode != ptr_mode;
4124     }
4125
4126   return aarch64_tls_referenced_p (x);
4127 }
4128
4129 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4130    The expansion for a table switch is quite expensive due to the number
4131    of instructions, the table lookup and hard to predict indirect jump.
4132    When optimizing for speed, and -O3 enabled, use the per-core tuning if
4133    set, otherwise use tables for > 16 cases as a tradeoff between size and
4134    performance.  When optimizing for size, use the default setting.  */
4135
4136 static unsigned int
4137 aarch64_case_values_threshold (void)
4138 {
4139   /* Use the specified limit for the number of cases before using jump
4140      tables at higher optimization levels.  */
4141   if (optimize > 2
4142       && selected_cpu->tune->max_case_values != 0)
4143     return selected_cpu->tune->max_case_values;
4144   else
4145     return optimize_size ? default_case_values_threshold () : 17;
4146 }
4147
4148 /* Return true if register REGNO is a valid index register.
4149    STRICT_P is true if REG_OK_STRICT is in effect.  */
4150
4151 bool
4152 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4153 {
4154   if (!HARD_REGISTER_NUM_P (regno))
4155     {
4156       if (!strict_p)
4157         return true;
4158
4159       if (!reg_renumber)
4160         return false;
4161
4162       regno = reg_renumber[regno];
4163     }
4164   return GP_REGNUM_P (regno);
4165 }
4166
4167 /* Return true if register REGNO is a valid base register for mode MODE.
4168    STRICT_P is true if REG_OK_STRICT is in effect.  */
4169
4170 bool
4171 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4172 {
4173   if (!HARD_REGISTER_NUM_P (regno))
4174     {
4175       if (!strict_p)
4176         return true;
4177
4178       if (!reg_renumber)
4179         return false;
4180
4181       regno = reg_renumber[regno];
4182     }
4183
4184   /* The fake registers will be eliminated to either the stack or
4185      hard frame pointer, both of which are usually valid base registers.
4186      Reload deals with the cases where the eliminated form isn't valid.  */
4187   return (GP_REGNUM_P (regno)
4188           || regno == SP_REGNUM
4189           || regno == FRAME_POINTER_REGNUM
4190           || regno == ARG_POINTER_REGNUM);
4191 }
4192
4193 /* Return true if X is a valid base register for mode MODE.
4194    STRICT_P is true if REG_OK_STRICT is in effect.  */
4195
4196 static bool
4197 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4198 {
4199   if (!strict_p && GET_CODE (x) == SUBREG)
4200     x = SUBREG_REG (x);
4201
4202   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4203 }
4204
4205 /* Return true if address offset is a valid index.  If it is, fill in INFO
4206    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
4207
4208 static bool
4209 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4210                         machine_mode mode, bool strict_p)
4211 {
4212   enum aarch64_address_type type;
4213   rtx index;
4214   int shift;
4215
4216   /* (reg:P) */
4217   if ((REG_P (x) || GET_CODE (x) == SUBREG)
4218       && GET_MODE (x) == Pmode)
4219     {
4220       type = ADDRESS_REG_REG;
4221       index = x;
4222       shift = 0;
4223     }
4224   /* (sign_extend:DI (reg:SI)) */
4225   else if ((GET_CODE (x) == SIGN_EXTEND
4226             || GET_CODE (x) == ZERO_EXTEND)
4227            && GET_MODE (x) == DImode
4228            && GET_MODE (XEXP (x, 0)) == SImode)
4229     {
4230       type = (GET_CODE (x) == SIGN_EXTEND)
4231         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232       index = XEXP (x, 0);
4233       shift = 0;
4234     }
4235   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4236   else if (GET_CODE (x) == MULT
4237            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4238                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4239            && GET_MODE (XEXP (x, 0)) == DImode
4240            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4241            && CONST_INT_P (XEXP (x, 1)))
4242     {
4243       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4244         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4245       index = XEXP (XEXP (x, 0), 0);
4246       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4247     }
4248   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4249   else if (GET_CODE (x) == ASHIFT
4250            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4251                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4252            && GET_MODE (XEXP (x, 0)) == DImode
4253            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4254            && CONST_INT_P (XEXP (x, 1)))
4255     {
4256       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4257         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4258       index = XEXP (XEXP (x, 0), 0);
4259       shift = INTVAL (XEXP (x, 1));
4260     }
4261   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4262   else if ((GET_CODE (x) == SIGN_EXTRACT
4263             || GET_CODE (x) == ZERO_EXTRACT)
4264            && GET_MODE (x) == DImode
4265            && GET_CODE (XEXP (x, 0)) == MULT
4266            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4267            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4268     {
4269       type = (GET_CODE (x) == SIGN_EXTRACT)
4270         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4271       index = XEXP (XEXP (x, 0), 0);
4272       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4273       if (INTVAL (XEXP (x, 1)) != 32 + shift
4274           || INTVAL (XEXP (x, 2)) != 0)
4275         shift = -1;
4276     }
4277   /* (and:DI (mult:DI (reg:DI) (const_int scale))
4278      (const_int 0xffffffff<<shift)) */
4279   else if (GET_CODE (x) == AND
4280            && GET_MODE (x) == DImode
4281            && GET_CODE (XEXP (x, 0)) == MULT
4282            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4283            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4284            && CONST_INT_P (XEXP (x, 1)))
4285     {
4286       type = ADDRESS_REG_UXTW;
4287       index = XEXP (XEXP (x, 0), 0);
4288       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4289       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4290         shift = -1;
4291     }
4292   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4293   else if ((GET_CODE (x) == SIGN_EXTRACT
4294             || GET_CODE (x) == ZERO_EXTRACT)
4295            && GET_MODE (x) == DImode
4296            && GET_CODE (XEXP (x, 0)) == ASHIFT
4297            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4298            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4299     {
4300       type = (GET_CODE (x) == SIGN_EXTRACT)
4301         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4302       index = XEXP (XEXP (x, 0), 0);
4303       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4304       if (INTVAL (XEXP (x, 1)) != 32 + shift
4305           || INTVAL (XEXP (x, 2)) != 0)
4306         shift = -1;
4307     }
4308   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4309      (const_int 0xffffffff<<shift)) */
4310   else if (GET_CODE (x) == AND
4311            && GET_MODE (x) == DImode
4312            && GET_CODE (XEXP (x, 0)) == ASHIFT
4313            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4314            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4315            && CONST_INT_P (XEXP (x, 1)))
4316     {
4317       type = ADDRESS_REG_UXTW;
4318       index = XEXP (XEXP (x, 0), 0);
4319       shift = INTVAL (XEXP (XEXP (x, 0), 1));
4320       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4321         shift = -1;
4322     }
4323   /* (mult:P (reg:P) (const_int scale)) */
4324   else if (GET_CODE (x) == MULT
4325            && GET_MODE (x) == Pmode
4326            && GET_MODE (XEXP (x, 0)) == Pmode
4327            && CONST_INT_P (XEXP (x, 1)))
4328     {
4329       type = ADDRESS_REG_REG;
4330       index = XEXP (x, 0);
4331       shift = exact_log2 (INTVAL (XEXP (x, 1)));
4332     }
4333   /* (ashift:P (reg:P) (const_int shift)) */
4334   else if (GET_CODE (x) == ASHIFT
4335            && GET_MODE (x) == Pmode
4336            && GET_MODE (XEXP (x, 0)) == Pmode
4337            && CONST_INT_P (XEXP (x, 1)))
4338     {
4339       type = ADDRESS_REG_REG;
4340       index = XEXP (x, 0);
4341       shift = INTVAL (XEXP (x, 1));
4342     }
4343   else
4344     return false;
4345
4346   if (GET_CODE (index) == SUBREG)
4347     index = SUBREG_REG (index);
4348
4349   if ((shift == 0 ||
4350        (shift > 0 && shift <= 3
4351         && (1 << shift) == GET_MODE_SIZE (mode)))
4352       && REG_P (index)
4353       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4354     {
4355       info->type = type;
4356       info->offset = index;
4357       info->shift = shift;
4358       return true;
4359     }
4360
4361   return false;
4362 }
4363
4364 /* Return true if MODE is one of the modes for which we
4365    support LDP/STP operations.  */
4366
4367 static bool
4368 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4369 {
4370   return mode == SImode || mode == DImode
4371          || mode == SFmode || mode == DFmode
4372          || (aarch64_vector_mode_supported_p (mode)
4373              && GET_MODE_SIZE (mode) == 8);
4374 }
4375
4376 /* Return true if REGNO is a virtual pointer register, or an eliminable
4377    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
4378    include stack_pointer or hard_frame_pointer.  */
4379 static bool
4380 virt_or_elim_regno_p (unsigned regno)
4381 {
4382   return ((regno >= FIRST_VIRTUAL_REGISTER
4383            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4384           || regno == FRAME_POINTER_REGNUM
4385           || regno == ARG_POINTER_REGNUM);
4386 }
4387
4388 /* Return true if X is a valid address for machine mode MODE.  If it is,
4389    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
4390    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
4391
4392 static bool
4393 aarch64_classify_address (struct aarch64_address_info *info,
4394                           rtx x, machine_mode mode,
4395                           RTX_CODE outer_code, bool strict_p)
4396 {
4397   enum rtx_code code = GET_CODE (x);
4398   rtx op0, op1;
4399
4400   /* On BE, we use load/store pair for all large int mode load/stores.
4401      TI/TFmode may also use a load/store pair.  */
4402   bool load_store_pair_p = (outer_code == PARALLEL
4403                             || mode == TImode
4404                             || mode == TFmode
4405                             || (BYTES_BIG_ENDIAN
4406                                 && aarch64_vect_struct_mode_p (mode)));
4407
4408   bool allow_reg_index_p =
4409     !load_store_pair_p
4410     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4411     && !aarch64_vect_struct_mode_p (mode);
4412
4413   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4414      REG addressing.  */
4415   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4416       && (code != POST_INC && code != REG))
4417     return false;
4418
4419   switch (code)
4420     {
4421     case REG:
4422     case SUBREG:
4423       info->type = ADDRESS_REG_IMM;
4424       info->base = x;
4425       info->offset = const0_rtx;
4426       return aarch64_base_register_rtx_p (x, strict_p);
4427
4428     case PLUS:
4429       op0 = XEXP (x, 0);
4430       op1 = XEXP (x, 1);
4431
4432       if (! strict_p
4433           && REG_P (op0)
4434           && virt_or_elim_regno_p (REGNO (op0))
4435           && CONST_INT_P (op1))
4436         {
4437           info->type = ADDRESS_REG_IMM;
4438           info->base = op0;
4439           info->offset = op1;
4440
4441           return true;
4442         }
4443
4444       if (GET_MODE_SIZE (mode) != 0
4445           && CONST_INT_P (op1)
4446           && aarch64_base_register_rtx_p (op0, strict_p))
4447         {
4448           HOST_WIDE_INT offset = INTVAL (op1);
4449
4450           info->type = ADDRESS_REG_IMM;
4451           info->base = op0;
4452           info->offset = op1;
4453
4454           /* TImode and TFmode values are allowed in both pairs of X
4455              registers and individual Q registers.  The available
4456              address modes are:
4457              X,X: 7-bit signed scaled offset
4458              Q:   9-bit signed offset
4459              We conservatively require an offset representable in either mode.
4460              When performing the check for pairs of X registers i.e.  LDP/STP
4461              pass down DImode since that is the natural size of the LDP/STP
4462              instruction memory accesses.  */
4463           if (mode == TImode || mode == TFmode)
4464             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4465                     && (offset_9bit_signed_unscaled_p (mode, offset)
4466                         || offset_12bit_unsigned_scaled_p (mode, offset)));
4467
4468           /* A 7bit offset check because OImode will emit a ldp/stp
4469              instruction (only big endian will get here).
4470              For ldp/stp instructions, the offset is scaled for the size of a
4471              single element of the pair.  */
4472           if (mode == OImode)
4473             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4474
4475           /* Three 9/12 bit offsets checks because CImode will emit three
4476              ldr/str instructions (only big endian will get here).  */
4477           if (mode == CImode)
4478             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4479                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4480                         || offset_12bit_unsigned_scaled_p (V16QImode,
4481                                                            offset + 32)));
4482
4483           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4484              instructions (only big endian will get here).  */
4485           if (mode == XImode)
4486             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4487                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4488                                                             offset + 32));
4489
4490           if (load_store_pair_p)
4491             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4492                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4493           else
4494             return (offset_9bit_signed_unscaled_p (mode, offset)
4495                     || offset_12bit_unsigned_scaled_p (mode, offset));
4496         }
4497
4498       if (allow_reg_index_p)
4499         {
4500           /* Look for base + (scaled/extended) index register.  */
4501           if (aarch64_base_register_rtx_p (op0, strict_p)
4502               && aarch64_classify_index (info, op1, mode, strict_p))
4503             {
4504               info->base = op0;
4505               return true;
4506             }
4507           if (aarch64_base_register_rtx_p (op1, strict_p)
4508               && aarch64_classify_index (info, op0, mode, strict_p))
4509             {
4510               info->base = op1;
4511               return true;
4512             }
4513         }
4514
4515       return false;
4516
4517     case POST_INC:
4518     case POST_DEC:
4519     case PRE_INC:
4520     case PRE_DEC:
4521       info->type = ADDRESS_REG_WB;
4522       info->base = XEXP (x, 0);
4523       info->offset = NULL_RTX;
4524       return aarch64_base_register_rtx_p (info->base, strict_p);
4525
4526     case POST_MODIFY:
4527     case PRE_MODIFY:
4528       info->type = ADDRESS_REG_WB;
4529       info->base = XEXP (x, 0);
4530       if (GET_CODE (XEXP (x, 1)) == PLUS
4531           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4532           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4533           && aarch64_base_register_rtx_p (info->base, strict_p))
4534         {
4535           HOST_WIDE_INT offset;
4536           info->offset = XEXP (XEXP (x, 1), 1);
4537           offset = INTVAL (info->offset);
4538
4539           /* TImode and TFmode values are allowed in both pairs of X
4540              registers and individual Q registers.  The available
4541              address modes are:
4542              X,X: 7-bit signed scaled offset
4543              Q:   9-bit signed offset
4544              We conservatively require an offset representable in either mode.
4545            */
4546           if (mode == TImode || mode == TFmode)
4547             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4548                     && offset_9bit_signed_unscaled_p (mode, offset));
4549
4550           if (load_store_pair_p)
4551             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4552                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4553           else
4554             return offset_9bit_signed_unscaled_p (mode, offset);
4555         }
4556       return false;
4557
4558     case CONST:
4559     case SYMBOL_REF:
4560     case LABEL_REF:
4561       /* load literal: pc-relative constant pool entry.  Only supported
4562          for SI mode or larger.  */
4563       info->type = ADDRESS_SYMBOLIC;
4564
4565       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4566         {
4567           rtx sym, addend;
4568
4569           split_const (x, &sym, &addend);
4570           return ((GET_CODE (sym) == LABEL_REF
4571                    || (GET_CODE (sym) == SYMBOL_REF
4572                        && CONSTANT_POOL_ADDRESS_P (sym)
4573                        && aarch64_pcrelative_literal_loads)));
4574         }
4575       return false;
4576
4577     case LO_SUM:
4578       info->type = ADDRESS_LO_SUM;
4579       info->base = XEXP (x, 0);
4580       info->offset = XEXP (x, 1);
4581       if (allow_reg_index_p
4582           && aarch64_base_register_rtx_p (info->base, strict_p))
4583         {
4584           rtx sym, offs;
4585           split_const (info->offset, &sym, &offs);
4586           if (GET_CODE (sym) == SYMBOL_REF
4587               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4588             {
4589               /* The symbol and offset must be aligned to the access size.  */
4590               unsigned int align;
4591               unsigned int ref_size;
4592
4593               if (CONSTANT_POOL_ADDRESS_P (sym))
4594                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4595               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4596                 {
4597                   tree exp = SYMBOL_REF_DECL (sym);
4598                   align = TYPE_ALIGN (TREE_TYPE (exp));
4599                   align = CONSTANT_ALIGNMENT (exp, align);
4600                 }
4601               else if (SYMBOL_REF_DECL (sym))
4602                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4603               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4604                        && SYMBOL_REF_BLOCK (sym) != NULL)
4605                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4606               else
4607                 align = BITS_PER_UNIT;
4608
4609               ref_size = GET_MODE_SIZE (mode);
4610               if (ref_size == 0)
4611                 ref_size = GET_MODE_SIZE (DImode);
4612
4613               return ((INTVAL (offs) & (ref_size - 1)) == 0
4614                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4615             }
4616         }
4617       return false;
4618
4619     default:
4620       return false;
4621     }
4622 }
4623
4624 /* Return true if the address X is valid for a PRFM instruction.
4625    STRICT_P is true if we should do strict checking with
4626    aarch64_classify_address.  */
4627
4628 bool
4629 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
4630 {
4631   struct aarch64_address_info addr;
4632
4633   /* PRFM accepts the same addresses as DImode...  */
4634   bool res = aarch64_classify_address (&addr, x, DImode, MEM, strict_p);
4635   if (!res)
4636     return false;
4637
4638   /* ... except writeback forms.  */
4639   return addr.type != ADDRESS_REG_WB;
4640 }
4641
4642 bool
4643 aarch64_symbolic_address_p (rtx x)
4644 {
4645   rtx offset;
4646
4647   split_const (x, &x, &offset);
4648   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4649 }
4650
4651 /* Classify the base of symbolic expression X.  */
4652
4653 enum aarch64_symbol_type
4654 aarch64_classify_symbolic_expression (rtx x)
4655 {
4656   rtx offset;
4657
4658   split_const (x, &x, &offset);
4659   return aarch64_classify_symbol (x, offset);
4660 }
4661
4662
4663 /* Return TRUE if X is a legitimate address for accessing memory in
4664    mode MODE.  */
4665 static bool
4666 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4667 {
4668   struct aarch64_address_info addr;
4669
4670   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4671 }
4672
4673 /* Return TRUE if X is a legitimate address for accessing memory in
4674    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4675    pair operation.  */
4676 bool
4677 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4678                               RTX_CODE outer_code, bool strict_p)
4679 {
4680   struct aarch64_address_info addr;
4681
4682   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4683 }
4684
4685 /* Split an out-of-range address displacement into a base and offset.
4686    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4687    to increase opportunities for sharing the base address of different sizes.
4688    For unaligned accesses and TI/TF mode use the signed 9-bit range.  */
4689 static bool
4690 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4691 {
4692   HOST_WIDE_INT offset = INTVAL (*disp);
4693   HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4694
4695   if (mode == TImode || mode == TFmode
4696       || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4697     base = (offset + 0x100) & ~0x1ff;
4698
4699   *off = GEN_INT (base);
4700   *disp = GEN_INT (offset - base);
4701   return true;
4702 }
4703
4704 /* Return the binary representation of floating point constant VALUE in INTVAL.
4705    If the value cannot be converted, return false without setting INTVAL.
4706    The conversion is done in the given MODE.  */
4707 bool
4708 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
4709 {
4710
4711   /* We make a general exception for 0.  */
4712   if (aarch64_float_const_zero_rtx_p (value))
4713     {
4714       *intval = 0;
4715       return true;
4716     }
4717
4718   machine_mode mode = GET_MODE (value);
4719   if (GET_CODE (value) != CONST_DOUBLE
4720       || !SCALAR_FLOAT_MODE_P (mode)
4721       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
4722       /* Only support up to DF mode.  */
4723       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
4724     return false;
4725
4726   unsigned HOST_WIDE_INT ival = 0;
4727
4728   long res[2];
4729   real_to_target (res,
4730                   CONST_DOUBLE_REAL_VALUE (value),
4731                   REAL_MODE_FORMAT (mode));
4732
4733   if (mode == DFmode)
4734     {
4735       int order = BYTES_BIG_ENDIAN ? 1 : 0;
4736       ival = zext_hwi (res[order], 32);
4737       ival |= (zext_hwi (res[1 - order], 32) << 32);
4738     }
4739   else
4740       ival = zext_hwi (res[0], 32);
4741
4742   *intval = ival;
4743   return true;
4744 }
4745
4746 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4747    single MOV(+MOVK) followed by an FMOV.  */
4748 bool
4749 aarch64_float_const_rtx_p (rtx x)
4750 {
4751   machine_mode mode = GET_MODE (x);
4752   if (mode == VOIDmode)
4753     return false;
4754
4755   /* Determine whether it's cheaper to write float constants as
4756      mov/movk pairs over ldr/adrp pairs.  */
4757   unsigned HOST_WIDE_INT ival;
4758
4759   if (GET_CODE (x) == CONST_DOUBLE
4760       && SCALAR_FLOAT_MODE_P (mode)
4761       && aarch64_reinterpret_float_as_int (x, &ival))
4762     {
4763       machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode (mode);
4764       int num_instr = aarch64_internal_mov_immediate
4765                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
4766       return num_instr < 3;
4767     }
4768
4769   return false;
4770 }
4771
4772 /* Return TRUE if rtx X is immediate constant 0.0 */
4773 bool
4774 aarch64_float_const_zero_rtx_p (rtx x)
4775 {
4776   if (GET_MODE (x) == VOIDmode)
4777     return false;
4778
4779   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4780     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4781   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4782 }
4783
4784 /* Return TRUE if rtx X is immediate constant that fits in a single
4785    MOVI immediate operation.  */
4786 bool
4787 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
4788 {
4789   if (!TARGET_SIMD)
4790      return false;
4791
4792   machine_mode vmode, imode;
4793   unsigned HOST_WIDE_INT ival;
4794
4795   if (GET_CODE (x) == CONST_DOUBLE
4796       && SCALAR_FLOAT_MODE_P (mode))
4797     {
4798       if (!aarch64_reinterpret_float_as_int (x, &ival))
4799         return false;
4800
4801       /* We make a general exception for 0.  */
4802       if (aarch64_float_const_zero_rtx_p (x))
4803         return true;
4804
4805       imode = int_mode_for_mode (mode);
4806     }
4807   else if (GET_CODE (x) == CONST_INT
4808            && SCALAR_INT_MODE_P (mode))
4809     {
4810        imode = mode;
4811        ival = INTVAL (x);
4812     }
4813   else
4814     return false;
4815
4816    /* use a 64 bit mode for everything except for DI/DF mode, where we use
4817      a 128 bit vector mode.  */
4818   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
4819
4820   vmode = aarch64_simd_container_mode (imode, width);
4821   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
4822
4823   return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);
4824 }
4825
4826
4827 /* Return the fixed registers used for condition codes.  */
4828
4829 static bool
4830 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4831 {
4832   *p1 = CC_REGNUM;
4833   *p2 = INVALID_REGNUM;
4834   return true;
4835 }
4836
4837 /* This function is used by the call expanders of the machine description.
4838    RESULT is the register in which the result is returned.  It's NULL for
4839    "call" and "sibcall".
4840    MEM is the location of the function call.
4841    SIBCALL indicates whether this function call is normal call or sibling call.
4842    It will generate different pattern accordingly.  */
4843
4844 void
4845 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
4846 {
4847   rtx call, callee, tmp;
4848   rtvec vec;
4849   machine_mode mode;
4850
4851   gcc_assert (MEM_P (mem));
4852   callee = XEXP (mem, 0);
4853   mode = GET_MODE (callee);
4854   gcc_assert (mode == Pmode);
4855
4856   /* Decide if we should generate indirect calls by loading the
4857      address of the callee into a register before performing
4858      the branch-and-link.  */
4859   if (SYMBOL_REF_P (callee)
4860       ? (aarch64_is_long_call_p (callee)
4861          || aarch64_is_noplt_call_p (callee))
4862       : !REG_P (callee))
4863     XEXP (mem, 0) = force_reg (mode, callee);
4864
4865   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
4866
4867   if (result != NULL_RTX)
4868     call = gen_rtx_SET (result, call);
4869
4870   if (sibcall)
4871     tmp = ret_rtx;
4872   else
4873     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
4874
4875   vec = gen_rtvec (2, call, tmp);
4876   call = gen_rtx_PARALLEL (VOIDmode, vec);
4877
4878   aarch64_emit_call_insn (call);
4879 }
4880
4881 /* Emit call insn with PAT and do aarch64-specific handling.  */
4882
4883 void
4884 aarch64_emit_call_insn (rtx pat)
4885 {
4886   rtx insn = emit_call_insn (pat);
4887
4888   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4889   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4890   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4891 }
4892
4893 machine_mode
4894 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4895 {
4896   /* All floating point compares return CCFP if it is an equality
4897      comparison, and CCFPE otherwise.  */
4898   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4899     {
4900       switch (code)
4901         {
4902         case EQ:
4903         case NE:
4904         case UNORDERED:
4905         case ORDERED:
4906         case UNLT:
4907         case UNLE:
4908         case UNGT:
4909         case UNGE:
4910         case UNEQ:
4911         case LTGT:
4912           return CCFPmode;
4913
4914         case LT:
4915         case LE:
4916         case GT:
4917         case GE:
4918           return CCFPEmode;
4919
4920         default:
4921           gcc_unreachable ();
4922         }
4923     }
4924
4925   /* Equality comparisons of short modes against zero can be performed
4926      using the TST instruction with the appropriate bitmask.  */
4927   if (y == const0_rtx && REG_P (x)
4928       && (code == EQ || code == NE)
4929       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4930     return CC_NZmode;
4931
4932   /* Similarly, comparisons of zero_extends from shorter modes can
4933      be performed using an ANDS with an immediate mask.  */
4934   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4935       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4936       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4937       && (code == EQ || code == NE))
4938     return CC_NZmode;
4939
4940   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4941       && y == const0_rtx
4942       && (code == EQ || code == NE || code == LT || code == GE)
4943       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4944           || GET_CODE (x) == NEG
4945           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4946               && CONST_INT_P (XEXP (x, 2)))))
4947     return CC_NZmode;
4948
4949   /* A compare with a shifted operand.  Because of canonicalization,
4950      the comparison will have to be swapped when we emit the assembly
4951      code.  */
4952   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4953       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
4954       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4955           || GET_CODE (x) == LSHIFTRT
4956           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4957     return CC_SWPmode;
4958
4959   /* Similarly for a negated operand, but we can only do this for
4960      equalities.  */
4961   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4962       && (REG_P (y) || GET_CODE (y) == SUBREG)
4963       && (code == EQ || code == NE)
4964       && GET_CODE (x) == NEG)
4965     return CC_Zmode;
4966
4967   /* A test for unsigned overflow.  */
4968   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4969       && code == NE
4970       && GET_CODE (x) == PLUS
4971       && GET_CODE (y) == ZERO_EXTEND)
4972     return CC_Cmode;
4973
4974   /* For everything else, return CCmode.  */
4975   return CCmode;
4976 }
4977
4978 static int
4979 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
4980
4981 int
4982 aarch64_get_condition_code (rtx x)
4983 {
4984   machine_mode mode = GET_MODE (XEXP (x, 0));
4985   enum rtx_code comp_code = GET_CODE (x);
4986
4987   if (GET_MODE_CLASS (mode) != MODE_CC)
4988     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4989   return aarch64_get_condition_code_1 (mode, comp_code);
4990 }
4991
4992 static int
4993 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
4994 {
4995   switch (mode)
4996     {
4997     case E_CCFPmode:
4998     case E_CCFPEmode:
4999       switch (comp_code)
5000         {
5001         case GE: return AARCH64_GE;
5002         case GT: return AARCH64_GT;
5003         case LE: return AARCH64_LS;
5004         case LT: return AARCH64_MI;
5005         case NE: return AARCH64_NE;
5006         case EQ: return AARCH64_EQ;
5007         case ORDERED: return AARCH64_VC;
5008         case UNORDERED: return AARCH64_VS;
5009         case UNLT: return AARCH64_LT;
5010         case UNLE: return AARCH64_LE;
5011         case UNGT: return AARCH64_HI;
5012         case UNGE: return AARCH64_PL;
5013         default: return -1;
5014         }
5015       break;
5016
5017     case E_CCmode:
5018       switch (comp_code)
5019         {
5020         case NE: return AARCH64_NE;
5021         case EQ: return AARCH64_EQ;
5022         case GE: return AARCH64_GE;
5023         case GT: return AARCH64_GT;
5024         case LE: return AARCH64_LE;
5025         case LT: return AARCH64_LT;
5026         case GEU: return AARCH64_CS;
5027         case GTU: return AARCH64_HI;
5028         case LEU: return AARCH64_LS;
5029         case LTU: return AARCH64_CC;
5030         default: return -1;
5031         }
5032       break;
5033
5034     case E_CC_SWPmode:
5035       switch (comp_code)
5036         {
5037         case NE: return AARCH64_NE;
5038         case EQ: return AARCH64_EQ;
5039         case GE: return AARCH64_LE;
5040         case GT: return AARCH64_LT;
5041         case LE: return AARCH64_GE;
5042         case LT: return AARCH64_GT;
5043         case GEU: return AARCH64_LS;
5044         case GTU: return AARCH64_CC;
5045         case LEU: return AARCH64_CS;
5046         case LTU: return AARCH64_HI;
5047         default: return -1;
5048         }
5049       break;
5050
5051     case E_CC_NZmode:
5052       switch (comp_code)
5053         {
5054         case NE: return AARCH64_NE;
5055         case EQ: return AARCH64_EQ;
5056         case GE: return AARCH64_PL;
5057         case LT: return AARCH64_MI;
5058         default: return -1;
5059         }
5060       break;
5061
5062     case E_CC_Zmode:
5063       switch (comp_code)
5064         {
5065         case NE: return AARCH64_NE;
5066         case EQ: return AARCH64_EQ;
5067         default: return -1;
5068         }
5069       break;
5070
5071     case E_CC_Cmode:
5072       switch (comp_code)
5073         {
5074         case NE: return AARCH64_CS;
5075         case EQ: return AARCH64_CC;
5076         default: return -1;
5077         }
5078       break;
5079
5080     default:
5081       return -1;
5082     }
5083
5084   return -1;
5085 }
5086
5087 bool
5088 aarch64_const_vec_all_same_in_range_p (rtx x,
5089                                   HOST_WIDE_INT minval,
5090                                   HOST_WIDE_INT maxval)
5091 {
5092   HOST_WIDE_INT firstval;
5093   int count, i;
5094
5095   if (GET_CODE (x) != CONST_VECTOR
5096       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
5097     return false;
5098
5099   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
5100   if (firstval < minval || firstval > maxval)
5101     return false;
5102
5103   count = CONST_VECTOR_NUNITS (x);
5104   for (i = 1; i < count; i++)
5105     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
5106       return false;
5107
5108   return true;
5109 }
5110
5111 bool
5112 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
5113 {
5114   return aarch64_const_vec_all_same_in_range_p (x, val, val);
5115 }
5116
5117
5118 /* N Z C V.  */
5119 #define AARCH64_CC_V 1
5120 #define AARCH64_CC_C (1 << 1)
5121 #define AARCH64_CC_Z (1 << 2)
5122 #define AARCH64_CC_N (1 << 3)
5123
5124 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
5125 static const int aarch64_nzcv_codes[] =
5126 {
5127   0,            /* EQ, Z == 1.  */
5128   AARCH64_CC_Z, /* NE, Z == 0.  */
5129   0,            /* CS, C == 1.  */
5130   AARCH64_CC_C, /* CC, C == 0.  */
5131   0,            /* MI, N == 1.  */
5132   AARCH64_CC_N, /* PL, N == 0.  */
5133   0,            /* VS, V == 1.  */
5134   AARCH64_CC_V, /* VC, V == 0.  */
5135   0,            /* HI, C ==1 && Z == 0.  */
5136   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
5137   AARCH64_CC_V, /* GE, N == V.  */
5138   0,            /* LT, N != V.  */
5139   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
5140   0,            /* LE, !(Z == 0 && N == V).  */
5141   0,            /* AL, Any.  */
5142   0             /* NV, Any.  */
5143 };
5144
5145 /* Print operand X to file F in a target specific manner according to CODE.
5146    The acceptable formatting commands given by CODE are:
5147      'c':               An integer or symbol address without a preceding #
5148                         sign.
5149      'e':               Print the sign/zero-extend size as a character 8->b,
5150                         16->h, 32->w.
5151      'p':               Prints N such that 2^N == X (X must be power of 2 and
5152                         const int).
5153      'P':               Print the number of non-zero bits in X (a const_int).
5154      'H':               Print the higher numbered register of a pair (TImode)
5155                         of regs.
5156      'm':               Print a condition (eq, ne, etc).
5157      'M':               Same as 'm', but invert condition.
5158      'b/h/s/d/q':       Print a scalar FP/SIMD register name.
5159      'S/T/U/V':         Print a FP/SIMD register name for a register list.
5160                         The register printed is the FP/SIMD register name
5161                         of X + 0/1/2/3 for S/T/U/V.
5162      'R':               Print a scalar FP/SIMD register name + 1.
5163      'X':               Print bottom 16 bits of integer constant in hex.
5164      'w/x':             Print a general register name or the zero register
5165                         (32-bit or 64-bit).
5166      '0':               Print a normal operand, if it's a general register,
5167                         then we assume DImode.
5168      'k':               Print NZCV for conditional compare instructions.
5169      'A':               Output address constant representing the first
5170                         argument of X, specifying a relocation offset
5171                         if appropriate.
5172      'L':               Output constant address specified by X
5173                         with a relocation offset if appropriate.
5174      'G':               Prints address of X, specifying a PC relative
5175                         relocation mode if appropriate.  */
5176
5177 static void
5178 aarch64_print_operand (FILE *f, rtx x, int code)
5179 {
5180   switch (code)
5181     {
5182     case 'c':
5183       switch (GET_CODE (x))
5184         {
5185         case CONST_INT:
5186           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
5187           break;
5188
5189         case SYMBOL_REF:
5190           output_addr_const (f, x);
5191           break;
5192
5193         case CONST:
5194           if (GET_CODE (XEXP (x, 0)) == PLUS
5195               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5196             {
5197               output_addr_const (f, x);
5198               break;
5199             }
5200           /* Fall through.  */
5201
5202         default:
5203           output_operand_lossage ("Unsupported operand for code '%c'", code);
5204         }
5205       break;
5206
5207     case 'e':
5208       {
5209         int n;
5210
5211         if (!CONST_INT_P (x)
5212             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
5213           {
5214             output_operand_lossage ("invalid operand for '%%%c'", code);
5215             return;
5216           }
5217
5218         switch (n)
5219           {
5220           case 3:
5221             fputc ('b', f);
5222             break;
5223           case 4:
5224             fputc ('h', f);
5225             break;
5226           case 5:
5227             fputc ('w', f);
5228             break;
5229           default:
5230             output_operand_lossage ("invalid operand for '%%%c'", code);
5231             return;
5232           }
5233       }
5234       break;
5235
5236     case 'p':
5237       {
5238         int n;
5239
5240         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5241           {
5242             output_operand_lossage ("invalid operand for '%%%c'", code);
5243             return;
5244           }
5245
5246         asm_fprintf (f, "%d", n);
5247       }
5248       break;
5249
5250     case 'P':
5251       if (!CONST_INT_P (x))
5252         {
5253           output_operand_lossage ("invalid operand for '%%%c'", code);
5254           return;
5255         }
5256
5257       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5258       break;
5259
5260     case 'H':
5261       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5262         {
5263           output_operand_lossage ("invalid operand for '%%%c'", code);
5264           return;
5265         }
5266
5267       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5268       break;
5269
5270     case 'M':
5271     case 'm':
5272       {
5273         int cond_code;
5274         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
5275         if (x == const_true_rtx)
5276           {
5277             if (code == 'M')
5278               fputs ("nv", f);
5279             return;
5280           }
5281
5282         if (!COMPARISON_P (x))
5283           {
5284             output_operand_lossage ("invalid operand for '%%%c'", code);
5285             return;
5286           }
5287
5288         cond_code = aarch64_get_condition_code (x);
5289         gcc_assert (cond_code >= 0);
5290         if (code == 'M')
5291           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5292         fputs (aarch64_condition_codes[cond_code], f);
5293       }
5294       break;
5295
5296     case 'b':
5297     case 'h':
5298     case 's':
5299     case 'd':
5300     case 'q':
5301       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5302         {
5303           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5304           return;
5305         }
5306       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5307       break;
5308
5309     case 'S':
5310     case 'T':
5311     case 'U':
5312     case 'V':
5313       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5314         {
5315           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5316           return;
5317         }
5318       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5319       break;
5320
5321     case 'R':
5322       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5323         {
5324           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5325           return;
5326         }
5327       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5328       break;
5329
5330     case 'X':
5331       if (!CONST_INT_P (x))
5332         {
5333           output_operand_lossage ("invalid operand for '%%%c'", code);
5334           return;
5335         }
5336       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5337       break;
5338
5339     case 'w':
5340     case 'x':
5341       if (x == const0_rtx
5342           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5343         {
5344           asm_fprintf (f, "%czr", code);
5345           break;
5346         }
5347
5348       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5349         {
5350           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5351           break;
5352         }
5353
5354       if (REG_P (x) && REGNO (x) == SP_REGNUM)
5355         {
5356           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5357           break;
5358         }
5359
5360       /* Fall through */
5361
5362     case 0:
5363       if (x == NULL)
5364         {
5365           output_operand_lossage ("missing operand");
5366           return;
5367         }
5368
5369       switch (GET_CODE (x))
5370         {
5371         case REG:
5372           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5373           break;
5374
5375         case MEM:
5376           output_address (GET_MODE (x), XEXP (x, 0));
5377           /* Check all memory references are Pmode - even with ILP32.  */
5378           gcc_assert (GET_MODE (XEXP (x, 0)) == Pmode);
5379           break;
5380
5381         case CONST:
5382         case LABEL_REF:
5383         case SYMBOL_REF:
5384           output_addr_const (asm_out_file, x);
5385           break;
5386
5387         case CONST_INT:
5388           asm_fprintf (f, "%wd", INTVAL (x));
5389           break;
5390
5391         case CONST_VECTOR:
5392           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5393             {
5394               gcc_assert (
5395                   aarch64_const_vec_all_same_in_range_p (x,
5396                                                          HOST_WIDE_INT_MIN,
5397                                                          HOST_WIDE_INT_MAX));
5398               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5399             }
5400           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5401             {
5402               fputc ('0', f);
5403             }
5404           else
5405             gcc_unreachable ();
5406           break;
5407
5408         case CONST_DOUBLE:
5409           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5410              be getting CONST_DOUBLEs holding integers.  */
5411           gcc_assert (GET_MODE (x) != VOIDmode);
5412           if (aarch64_float_const_zero_rtx_p (x))
5413             {
5414               fputc ('0', f);
5415               break;
5416             }
5417           else if (aarch64_float_const_representable_p (x))
5418             {
5419 #define buf_size 20
5420               char float_buf[buf_size] = {'\0'};
5421               real_to_decimal_for_mode (float_buf,
5422                                         CONST_DOUBLE_REAL_VALUE (x),
5423                                         buf_size, buf_size,
5424                                         1, GET_MODE (x));
5425               asm_fprintf (asm_out_file, "%s", float_buf);
5426               break;
5427 #undef buf_size
5428             }
5429           output_operand_lossage ("invalid constant");
5430           return;
5431         default:
5432           output_operand_lossage ("invalid operand");
5433           return;
5434         }
5435       break;
5436
5437     case 'A':
5438       if (GET_CODE (x) == HIGH)
5439         x = XEXP (x, 0);
5440
5441       switch (aarch64_classify_symbolic_expression (x))
5442         {
5443         case SYMBOL_SMALL_GOT_4G:
5444           asm_fprintf (asm_out_file, ":got:");
5445           break;
5446
5447         case SYMBOL_SMALL_TLSGD:
5448           asm_fprintf (asm_out_file, ":tlsgd:");
5449           break;
5450
5451         case SYMBOL_SMALL_TLSDESC:
5452           asm_fprintf (asm_out_file, ":tlsdesc:");
5453           break;
5454
5455         case SYMBOL_SMALL_TLSIE:
5456           asm_fprintf (asm_out_file, ":gottprel:");
5457           break;
5458
5459         case SYMBOL_TLSLE24:
5460           asm_fprintf (asm_out_file, ":tprel:");
5461           break;
5462
5463         case SYMBOL_TINY_GOT:
5464           gcc_unreachable ();
5465           break;
5466
5467         default:
5468           break;
5469         }
5470       output_addr_const (asm_out_file, x);
5471       break;
5472
5473     case 'L':
5474       switch (aarch64_classify_symbolic_expression (x))
5475         {
5476         case SYMBOL_SMALL_GOT_4G:
5477           asm_fprintf (asm_out_file, ":lo12:");
5478           break;
5479
5480         case SYMBOL_SMALL_TLSGD:
5481           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5482           break;
5483
5484         case SYMBOL_SMALL_TLSDESC:
5485           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5486           break;
5487
5488         case SYMBOL_SMALL_TLSIE:
5489           asm_fprintf (asm_out_file, ":gottprel_lo12:");
5490           break;
5491
5492         case SYMBOL_TLSLE12:
5493           asm_fprintf (asm_out_file, ":tprel_lo12:");
5494           break;
5495
5496         case SYMBOL_TLSLE24:
5497           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5498           break;
5499
5500         case SYMBOL_TINY_GOT:
5501           asm_fprintf (asm_out_file, ":got:");
5502           break;
5503
5504         case SYMBOL_TINY_TLSIE:
5505           asm_fprintf (asm_out_file, ":gottprel:");
5506           break;
5507
5508         default:
5509           break;
5510         }
5511       output_addr_const (asm_out_file, x);
5512       break;
5513
5514     case 'G':
5515       switch (aarch64_classify_symbolic_expression (x))
5516         {
5517         case SYMBOL_TLSLE24:
5518           asm_fprintf (asm_out_file, ":tprel_hi12:");
5519           break;
5520         default:
5521           break;
5522         }
5523       output_addr_const (asm_out_file, x);
5524       break;
5525
5526     case 'k':
5527       {
5528         HOST_WIDE_INT cond_code;
5529
5530         if (!CONST_INT_P (x))
5531           {
5532             output_operand_lossage ("invalid operand for '%%%c'", code);
5533             return;
5534           }
5535
5536         cond_code = INTVAL (x);
5537         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5538         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5539       }
5540       break;
5541
5542     default:
5543       output_operand_lossage ("invalid operand prefix '%%%c'", code);
5544       return;
5545     }
5546 }
5547
5548 static void
5549 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5550 {
5551   struct aarch64_address_info addr;
5552
5553   if (aarch64_classify_address (&addr, x, mode, MEM, true))
5554     switch (addr.type)
5555       {
5556       case ADDRESS_REG_IMM:
5557         if (addr.offset == const0_rtx)
5558           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5559         else
5560           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5561                        INTVAL (addr.offset));
5562         return;
5563
5564       case ADDRESS_REG_REG:
5565         if (addr.shift == 0)
5566           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5567                        reg_names [REGNO (addr.offset)]);
5568         else
5569           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5570                        reg_names [REGNO (addr.offset)], addr.shift);
5571         return;
5572
5573       case ADDRESS_REG_UXTW:
5574         if (addr.shift == 0)
5575           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5576                        REGNO (addr.offset) - R0_REGNUM);
5577         else
5578           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5579                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5580         return;
5581
5582       case ADDRESS_REG_SXTW:
5583         if (addr.shift == 0)
5584           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5585                        REGNO (addr.offset) - R0_REGNUM);
5586         else
5587           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5588                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
5589         return;
5590
5591       case ADDRESS_REG_WB:
5592         switch (GET_CODE (x))
5593           {
5594           case PRE_INC:
5595             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5596                          GET_MODE_SIZE (mode));
5597             return;
5598           case POST_INC:
5599             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5600                          GET_MODE_SIZE (mode));
5601             return;
5602           case PRE_DEC:
5603             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5604                          GET_MODE_SIZE (mode));
5605             return;
5606           case POST_DEC:
5607             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5608                          GET_MODE_SIZE (mode));
5609             return;
5610           case PRE_MODIFY:
5611             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5612                          INTVAL (addr.offset));
5613             return;
5614           case POST_MODIFY:
5615             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5616                          INTVAL (addr.offset));
5617             return;
5618           default:
5619             break;
5620           }
5621         break;
5622
5623       case ADDRESS_LO_SUM:
5624         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5625         output_addr_const (f, addr.offset);
5626         asm_fprintf (f, "]");
5627         return;
5628
5629       case ADDRESS_SYMBOLIC:
5630         break;
5631       }
5632
5633   output_addr_const (f, x);
5634 }
5635
5636 bool
5637 aarch64_label_mentioned_p (rtx x)
5638 {
5639   const char *fmt;
5640   int i;
5641
5642   if (GET_CODE (x) == LABEL_REF)
5643     return true;
5644
5645   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5646      referencing instruction, but they are constant offsets, not
5647      symbols.  */
5648   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5649     return false;
5650
5651   fmt = GET_RTX_FORMAT (GET_CODE (x));
5652   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5653     {
5654       if (fmt[i] == 'E')
5655         {
5656           int j;
5657
5658           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5659             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5660               return 1;
5661         }
5662       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5663         return 1;
5664     }
5665
5666   return 0;
5667 }
5668
5669 /* Implement REGNO_REG_CLASS.  */
5670
5671 enum reg_class
5672 aarch64_regno_regclass (unsigned regno)
5673 {
5674   if (GP_REGNUM_P (regno))
5675     return GENERAL_REGS;
5676
5677   if (regno == SP_REGNUM)
5678     return STACK_REG;
5679
5680   if (regno == FRAME_POINTER_REGNUM
5681       || regno == ARG_POINTER_REGNUM)
5682     return POINTER_REGS;
5683
5684   if (FP_REGNUM_P (regno))
5685     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5686
5687   return NO_REGS;
5688 }
5689
5690 static rtx
5691 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5692 {
5693   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5694      where mask is selected by alignment and size of the offset.
5695      We try to pick as large a range for the offset as possible to
5696      maximize the chance of a CSE.  However, for aligned addresses
5697      we limit the range to 4k so that structures with different sized
5698      elements are likely to use the same base.  We need to be careful
5699      not to split a CONST for some forms of address expression, otherwise
5700      it will generate sub-optimal code.  */
5701
5702   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5703     {
5704       rtx base = XEXP (x, 0);
5705       rtx offset_rtx = XEXP (x, 1);
5706       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5707
5708       if (GET_CODE (base) == PLUS)
5709         {
5710           rtx op0 = XEXP (base, 0);
5711           rtx op1 = XEXP (base, 1);
5712
5713           /* Force any scaling into a temp for CSE.  */
5714           op0 = force_reg (Pmode, op0);
5715           op1 = force_reg (Pmode, op1);
5716
5717           /* Let the pointer register be in op0.  */
5718           if (REG_POINTER (op1))
5719             std::swap (op0, op1);
5720
5721           /* If the pointer is virtual or frame related, then we know that
5722              virtual register instantiation or register elimination is going
5723              to apply a second constant.  We want the two constants folded
5724              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5725           if (virt_or_elim_regno_p (REGNO (op0)))
5726             {
5727               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5728                                    NULL_RTX, true, OPTAB_DIRECT);
5729               return gen_rtx_PLUS (Pmode, base, op1);
5730             }
5731
5732           /* Otherwise, in order to encourage CSE (and thence loop strength
5733              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5734           base = expand_binop (Pmode, add_optab, op0, op1,
5735                                NULL_RTX, true, OPTAB_DIRECT);
5736           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5737         }
5738
5739       /* Does it look like we'll need a 16-byte load/store-pair operation?  */
5740       HOST_WIDE_INT base_offset;
5741       if (GET_MODE_SIZE (mode) > 16)
5742         base_offset = (offset + 0x400) & ~0x7f0;
5743       /* For offsets aren't a multiple of the access size, the limit is
5744          -256...255.  */
5745       else if (offset & (GET_MODE_SIZE (mode) - 1))
5746         {
5747           base_offset = (offset + 0x100) & ~0x1ff;
5748
5749           /* BLKmode typically uses LDP of X-registers.  */
5750           if (mode == BLKmode)
5751             base_offset = (offset + 512) & ~0x3ff;
5752         }
5753       /* Small negative offsets are supported.  */
5754       else if (IN_RANGE (offset, -256, 0))
5755         base_offset = 0;
5756       else if (mode == TImode || mode == TFmode)
5757         base_offset = (offset + 0x100) & ~0x1ff;
5758       /* Use 12-bit offset by access size.  */
5759       else
5760         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5761
5762       if (base_offset != 0)
5763         {
5764           base = plus_constant (Pmode, base, base_offset);
5765           base = force_operand (base, NULL_RTX);
5766           return plus_constant (Pmode, base, offset - base_offset);
5767         }
5768     }
5769
5770   return x;
5771 }
5772
5773 /* Return the reload icode required for a constant pool in mode.  */
5774 static enum insn_code
5775 aarch64_constant_pool_reload_icode (machine_mode mode)
5776 {
5777   switch (mode)
5778     {
5779     case E_SFmode:
5780       return CODE_FOR_aarch64_reload_movcpsfdi;
5781
5782     case E_DFmode:
5783       return CODE_FOR_aarch64_reload_movcpdfdi;
5784
5785     case E_TFmode:
5786       return CODE_FOR_aarch64_reload_movcptfdi;
5787
5788     case E_V8QImode:
5789       return CODE_FOR_aarch64_reload_movcpv8qidi;
5790
5791     case E_V16QImode:
5792       return CODE_FOR_aarch64_reload_movcpv16qidi;
5793
5794     case E_V4HImode:
5795       return CODE_FOR_aarch64_reload_movcpv4hidi;
5796
5797     case E_V8HImode:
5798       return CODE_FOR_aarch64_reload_movcpv8hidi;
5799
5800     case E_V2SImode:
5801       return CODE_FOR_aarch64_reload_movcpv2sidi;
5802
5803     case E_V4SImode:
5804       return CODE_FOR_aarch64_reload_movcpv4sidi;
5805
5806     case E_V2DImode:
5807       return CODE_FOR_aarch64_reload_movcpv2didi;
5808
5809     case E_V2DFmode:
5810       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5811
5812     default:
5813       gcc_unreachable ();
5814     }
5815
5816   gcc_unreachable ();
5817 }
5818 static reg_class_t
5819 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5820                           reg_class_t rclass,
5821                           machine_mode mode,
5822                           secondary_reload_info *sri)
5823 {
5824
5825   /* If we have to disable direct literal pool loads and stores because the
5826      function is too big, then we need a scratch register.  */
5827   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5828       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5829           || targetm.vector_mode_supported_p (GET_MODE (x)))
5830       && !aarch64_pcrelative_literal_loads)
5831     {
5832       sri->icode = aarch64_constant_pool_reload_icode (mode);
5833       return NO_REGS;
5834     }
5835
5836   /* Without the TARGET_SIMD instructions we cannot move a Q register
5837      to a Q register directly.  We need a scratch.  */
5838   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5839       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5840       && reg_class_subset_p (rclass, FP_REGS))
5841     {
5842       if (mode == TFmode)
5843         sri->icode = CODE_FOR_aarch64_reload_movtf;
5844       else if (mode == TImode)
5845         sri->icode = CODE_FOR_aarch64_reload_movti;
5846       return NO_REGS;
5847     }
5848
5849   /* A TFmode or TImode memory access should be handled via an FP_REGS
5850      because AArch64 has richer addressing modes for LDR/STR instructions
5851      than LDP/STP instructions.  */
5852   if (TARGET_FLOAT && rclass == GENERAL_REGS
5853       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5854     return FP_REGS;
5855
5856   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5857       return GENERAL_REGS;
5858
5859   return NO_REGS;
5860 }
5861
5862 static bool
5863 aarch64_can_eliminate (const int from, const int to)
5864 {
5865   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5866      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5867
5868   if (frame_pointer_needed)
5869     {
5870       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5871         return true;
5872       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5873         return false;
5874       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5875           && !cfun->calls_alloca)
5876         return true;
5877       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5878         return true;
5879
5880       return false;
5881     }
5882   else
5883     {
5884       /* If we decided that we didn't need a leaf frame pointer but then used
5885          LR in the function, then we'll want a frame pointer after all, so
5886          prevent this elimination to ensure a frame pointer is used.  */
5887       if (to == STACK_POINTER_REGNUM
5888           && flag_omit_leaf_frame_pointer
5889           && df_regs_ever_live_p (LR_REGNUM))
5890         return false;
5891     }
5892
5893   return true;
5894 }
5895
5896 HOST_WIDE_INT
5897 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5898 {
5899   aarch64_layout_frame ();
5900
5901   if (to == HARD_FRAME_POINTER_REGNUM)
5902     {
5903       if (from == ARG_POINTER_REGNUM)
5904         return cfun->machine->frame.hard_fp_offset;
5905
5906       if (from == FRAME_POINTER_REGNUM)
5907         return cfun->machine->frame.hard_fp_offset
5908                - cfun->machine->frame.locals_offset;
5909     }
5910
5911   if (to == STACK_POINTER_REGNUM)
5912     {
5913       if (from == FRAME_POINTER_REGNUM)
5914           return cfun->machine->frame.frame_size
5915                  - cfun->machine->frame.locals_offset;
5916     }
5917
5918   return cfun->machine->frame.frame_size;
5919 }
5920
5921 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5922    previous frame.  */
5923
5924 rtx
5925 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5926 {
5927   if (count != 0)
5928     return const0_rtx;
5929   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5930 }
5931
5932
5933 static void
5934 aarch64_asm_trampoline_template (FILE *f)
5935 {
5936   if (TARGET_ILP32)
5937     {
5938       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5939       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5940     }
5941   else
5942     {
5943       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5944       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5945     }
5946   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5947   assemble_aligned_integer (4, const0_rtx);
5948   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5949   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5950 }
5951
5952 static void
5953 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5954 {
5955   rtx fnaddr, mem, a_tramp;
5956   const int tramp_code_sz = 16;
5957
5958   /* Don't need to copy the trailing D-words, we fill those in below.  */
5959   emit_block_move (m_tramp, assemble_trampoline_template (),
5960                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5961   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5962   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5963   if (GET_MODE (fnaddr) != ptr_mode)
5964     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5965   emit_move_insn (mem, fnaddr);
5966
5967   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5968   emit_move_insn (mem, chain_value);
5969
5970   /* XXX We should really define a "clear_cache" pattern and use
5971      gen_clear_cache().  */
5972   a_tramp = XEXP (m_tramp, 0);
5973   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5974                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5975                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5976                      ptr_mode);
5977 }
5978
5979 static unsigned char
5980 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5981 {
5982   switch (regclass)
5983     {
5984     case CALLER_SAVE_REGS:
5985     case POINTER_REGS:
5986     case GENERAL_REGS:
5987     case ALL_REGS:
5988     case FP_REGS:
5989     case FP_LO_REGS:
5990       return
5991         aarch64_vector_mode_p (mode)
5992           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5993           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5994     case STACK_REG:
5995       return 1;
5996
5997     case NO_REGS:
5998       return 0;
5999
6000     default:
6001       break;
6002     }
6003   gcc_unreachable ();
6004 }
6005
6006 static reg_class_t
6007 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
6008 {
6009   if (regclass == POINTER_REGS)
6010     return GENERAL_REGS;
6011
6012   if (regclass == STACK_REG)
6013     {
6014       if (REG_P(x)
6015           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
6016           return regclass;
6017
6018       return NO_REGS;
6019     }
6020
6021   /* Register eliminiation can result in a request for
6022      SP+constant->FP_REGS.  We cannot support such operations which
6023      use SP as source and an FP_REG as destination, so reject out
6024      right now.  */
6025   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
6026     {
6027       rtx lhs = XEXP (x, 0);
6028
6029       /* Look through a possible SUBREG introduced by ILP32.  */
6030       if (GET_CODE (lhs) == SUBREG)
6031         lhs = SUBREG_REG (lhs);
6032
6033       gcc_assert (REG_P (lhs));
6034       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
6035                                       POINTER_REGS));
6036       return NO_REGS;
6037     }
6038
6039   return regclass;
6040 }
6041
6042 void
6043 aarch64_asm_output_labelref (FILE* f, const char *name)
6044 {
6045   asm_fprintf (f, "%U%s", name);
6046 }
6047
6048 static void
6049 aarch64_elf_asm_constructor (rtx symbol, int priority)
6050 {
6051   if (priority == DEFAULT_INIT_PRIORITY)
6052     default_ctor_section_asm_out_constructor (symbol, priority);
6053   else
6054     {
6055       section *s;
6056       /* While priority is known to be in range [0, 65535], so 18 bytes
6057          would be enough, the compiler might not know that.  To avoid
6058          -Wformat-truncation false positive, use a larger size.  */
6059       char buf[23];
6060       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
6061       s = get_section (buf, SECTION_WRITE, NULL);
6062       switch_to_section (s);
6063       assemble_align (POINTER_SIZE);
6064       assemble_aligned_integer (POINTER_BYTES, symbol);
6065     }
6066 }
6067
6068 static void
6069 aarch64_elf_asm_destructor (rtx symbol, int priority)
6070 {
6071   if (priority == DEFAULT_INIT_PRIORITY)
6072     default_dtor_section_asm_out_destructor (symbol, priority);
6073   else
6074     {
6075       section *s;
6076       /* While priority is known to be in range [0, 65535], so 18 bytes
6077          would be enough, the compiler might not know that.  To avoid
6078          -Wformat-truncation false positive, use a larger size.  */
6079       char buf[23];
6080       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
6081       s = get_section (buf, SECTION_WRITE, NULL);
6082       switch_to_section (s);
6083       assemble_align (POINTER_SIZE);
6084       assemble_aligned_integer (POINTER_BYTES, symbol);
6085     }
6086 }
6087
6088 const char*
6089 aarch64_output_casesi (rtx *operands)
6090 {
6091   char buf[100];
6092   char label[100];
6093   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
6094   int index;
6095   static const char *const patterns[4][2] =
6096   {
6097     {
6098       "ldrb\t%w3, [%0,%w1,uxtw]",
6099       "add\t%3, %4, %w3, sxtb #2"
6100     },
6101     {
6102       "ldrh\t%w3, [%0,%w1,uxtw #1]",
6103       "add\t%3, %4, %w3, sxth #2"
6104     },
6105     {
6106       "ldr\t%w3, [%0,%w1,uxtw #2]",
6107       "add\t%3, %4, %w3, sxtw #2"
6108     },
6109     /* We assume that DImode is only generated when not optimizing and
6110        that we don't really need 64-bit address offsets.  That would
6111        imply an object file with 8GB of code in a single function!  */
6112     {
6113       "ldr\t%w3, [%0,%w1,uxtw #2]",
6114       "add\t%3, %4, %w3, sxtw #2"
6115     }
6116   };
6117
6118   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
6119
6120   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
6121
6122   gcc_assert (index >= 0 && index <= 3);
6123
6124   /* Need to implement table size reduction, by chaning the code below.  */
6125   output_asm_insn (patterns[index][0], operands);
6126   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
6127   snprintf (buf, sizeof (buf),
6128             "adr\t%%4, %s", targetm.strip_name_encoding (label));
6129   output_asm_insn (buf, operands);
6130   output_asm_insn (patterns[index][1], operands);
6131   output_asm_insn ("br\t%3", operands);
6132   assemble_label (asm_out_file, label);
6133   return "";
6134 }
6135
6136
6137 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6138    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6139    operator.  */
6140
6141 int
6142 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
6143 {
6144   if (shift >= 0 && shift <= 3)
6145     {
6146       int size;
6147       for (size = 8; size <= 32; size *= 2)
6148         {
6149           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
6150           if (mask == bits << shift)
6151             return size;
6152         }
6153     }
6154   return 0;
6155 }
6156
6157 /* Constant pools are per function only when PC relative
6158    literal loads are true or we are in the large memory
6159    model.  */
6160
6161 static inline bool
6162 aarch64_can_use_per_function_literal_pools_p (void)
6163 {
6164   return (aarch64_pcrelative_literal_loads
6165           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
6166 }
6167
6168 static bool
6169 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
6170 {
6171   /* Fixme:: In an ideal world this would work similar
6172      to the logic in aarch64_select_rtx_section but this
6173      breaks bootstrap in gcc go.  For now we workaround
6174      this by returning false here.  */
6175   return false;
6176 }
6177
6178 /* Select appropriate section for constants depending
6179    on where we place literal pools.  */
6180
6181 static section *
6182 aarch64_select_rtx_section (machine_mode mode,
6183                             rtx x,
6184                             unsigned HOST_WIDE_INT align)
6185 {
6186   if (aarch64_can_use_per_function_literal_pools_p ())
6187     return function_section (current_function_decl);
6188
6189   return default_elf_select_rtx_section (mode, x, align);
6190 }
6191
6192 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
6193 void
6194 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
6195                                   HOST_WIDE_INT offset)
6196 {
6197   /* When using per-function literal pools, we must ensure that any code
6198      section is aligned to the minimal instruction length, lest we get
6199      errors from the assembler re "unaligned instructions".  */
6200   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
6201     ASM_OUTPUT_ALIGN (f, 2);
6202 }
6203
6204 /* Costs.  */
6205
6206 /* Helper function for rtx cost calculation.  Strip a shift expression
6207    from X.  Returns the inner operand if successful, or the original
6208    expression on failure.  */
6209 static rtx
6210 aarch64_strip_shift (rtx x)
6211 {
6212   rtx op = x;
6213
6214   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6215      we can convert both to ROR during final output.  */
6216   if ((GET_CODE (op) == ASHIFT
6217        || GET_CODE (op) == ASHIFTRT
6218        || GET_CODE (op) == LSHIFTRT
6219        || GET_CODE (op) == ROTATERT
6220        || GET_CODE (op) == ROTATE)
6221       && CONST_INT_P (XEXP (op, 1)))
6222     return XEXP (op, 0);
6223
6224   if (GET_CODE (op) == MULT
6225       && CONST_INT_P (XEXP (op, 1))
6226       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6227     return XEXP (op, 0);
6228
6229   return x;
6230 }
6231
6232 /* Helper function for rtx cost calculation.  Strip an extend
6233    expression from X.  Returns the inner operand if successful, or the
6234    original expression on failure.  We deal with a number of possible
6235    canonicalization variations here. If STRIP_SHIFT is true, then
6236    we can strip off a shift also.  */
6237 static rtx
6238 aarch64_strip_extend (rtx x, bool strip_shift)
6239 {
6240   rtx op = x;
6241
6242   /* Zero and sign extraction of a widened value.  */
6243   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6244       && XEXP (op, 2) == const0_rtx
6245       && GET_CODE (XEXP (op, 0)) == MULT
6246       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6247                                          XEXP (op, 1)))
6248     return XEXP (XEXP (op, 0), 0);
6249
6250   /* It can also be represented (for zero-extend) as an AND with an
6251      immediate.  */
6252   if (GET_CODE (op) == AND
6253       && GET_CODE (XEXP (op, 0)) == MULT
6254       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6255       && CONST_INT_P (XEXP (op, 1))
6256       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6257                            INTVAL (XEXP (op, 1))) != 0)
6258     return XEXP (XEXP (op, 0), 0);
6259
6260   /* Now handle extended register, as this may also have an optional
6261      left shift by 1..4.  */
6262   if (strip_shift
6263       && GET_CODE (op) == ASHIFT
6264       && CONST_INT_P (XEXP (op, 1))
6265       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6266     op = XEXP (op, 0);
6267
6268   if (GET_CODE (op) == ZERO_EXTEND
6269       || GET_CODE (op) == SIGN_EXTEND)
6270     op = XEXP (op, 0);
6271
6272   if (op != x)
6273     return op;
6274
6275   return x;
6276 }
6277
6278 /* Return true iff CODE is a shift supported in combination
6279    with arithmetic instructions.  */
6280
6281 static bool
6282 aarch64_shift_p (enum rtx_code code)
6283 {
6284   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6285 }
6286
6287
6288 /* Return true iff X is a cheap shift without a sign extend. */
6289
6290 static bool
6291 aarch64_cheap_mult_shift_p (rtx x)
6292 {
6293   rtx op0, op1;
6294
6295   op0 = XEXP (x, 0);
6296   op1 = XEXP (x, 1);
6297
6298   if (!(aarch64_tune_params.extra_tuning_flags
6299                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
6300     return false;
6301
6302   if (GET_CODE (op0) == SIGN_EXTEND)
6303     return false;
6304
6305   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
6306       && UINTVAL (op1) <= 4)
6307     return true;
6308
6309   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
6310     return false;
6311
6312   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
6313
6314   if (l2 > 0 && l2 <= 4)
6315     return true;
6316
6317   return false;
6318 }
6319
6320 /* Helper function for rtx cost calculation.  Calculate the cost of
6321    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6322    Return the calculated cost of the expression, recursing manually in to
6323    operands where needed.  */
6324
6325 static int
6326 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6327 {
6328   rtx op0, op1;
6329   const struct cpu_cost_table *extra_cost
6330     = aarch64_tune_params.insn_extra_cost;
6331   int cost = 0;
6332   bool compound_p = (outer == PLUS || outer == MINUS);
6333   machine_mode mode = GET_MODE (x);
6334
6335   gcc_checking_assert (code == MULT);
6336
6337   op0 = XEXP (x, 0);
6338   op1 = XEXP (x, 1);
6339
6340   if (VECTOR_MODE_P (mode))
6341     mode = GET_MODE_INNER (mode);
6342
6343   /* Integer multiply/fma.  */
6344   if (GET_MODE_CLASS (mode) == MODE_INT)
6345     {
6346       /* The multiply will be canonicalized as a shift, cost it as such.  */
6347       if (aarch64_shift_p (GET_CODE (x))
6348           || (CONST_INT_P (op1)
6349               && exact_log2 (INTVAL (op1)) > 0))
6350         {
6351           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6352                            || GET_CODE (op0) == SIGN_EXTEND;
6353           if (speed)
6354             {
6355               if (compound_p)
6356                 {
6357                   /* If the shift is considered cheap,
6358                      then don't add any cost. */
6359                   if (aarch64_cheap_mult_shift_p (x))
6360                     ;
6361                   else if (REG_P (op1))
6362                     /* ARITH + shift-by-register.  */
6363                     cost += extra_cost->alu.arith_shift_reg;
6364                   else if (is_extend)
6365                     /* ARITH + extended register.  We don't have a cost field
6366                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
6367                     cost += extra_cost->alu.extend_arith;
6368                   else
6369                     /* ARITH + shift-by-immediate.  */
6370                     cost += extra_cost->alu.arith_shift;
6371                 }
6372               else
6373                 /* LSL (immediate).  */
6374                 cost += extra_cost->alu.shift;
6375
6376             }
6377           /* Strip extends as we will have costed them in the case above.  */
6378           if (is_extend)
6379             op0 = aarch64_strip_extend (op0, true);
6380
6381           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6382
6383           return cost;
6384         }
6385
6386       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
6387          compound and let the below cases handle it.  After all, MNEG is a
6388          special-case alias of MSUB.  */
6389       if (GET_CODE (op0) == NEG)
6390         {
6391           op0 = XEXP (op0, 0);
6392           compound_p = true;
6393         }
6394
6395       /* Integer multiplies or FMAs have zero/sign extending variants.  */
6396       if ((GET_CODE (op0) == ZERO_EXTEND
6397            && GET_CODE (op1) == ZERO_EXTEND)
6398           || (GET_CODE (op0) == SIGN_EXTEND
6399               && GET_CODE (op1) == SIGN_EXTEND))
6400         {
6401           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6402           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6403
6404           if (speed)
6405             {
6406               if (compound_p)
6407                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
6408                 cost += extra_cost->mult[0].extend_add;
6409               else
6410                 /* MUL/SMULL/UMULL.  */
6411                 cost += extra_cost->mult[0].extend;
6412             }
6413
6414           return cost;
6415         }
6416
6417       /* This is either an integer multiply or a MADD.  In both cases
6418          we want to recurse and cost the operands.  */
6419       cost += rtx_cost (op0, mode, MULT, 0, speed);
6420       cost += rtx_cost (op1, mode, MULT, 1, speed);
6421
6422       if (speed)
6423         {
6424           if (compound_p)
6425             /* MADD/MSUB.  */
6426             cost += extra_cost->mult[mode == DImode].add;
6427           else
6428             /* MUL.  */
6429             cost += extra_cost->mult[mode == DImode].simple;
6430         }
6431
6432       return cost;
6433     }
6434   else
6435     {
6436       if (speed)
6437         {
6438           /* Floating-point FMA/FMUL can also support negations of the
6439              operands, unless the rounding mode is upward or downward in
6440              which case FNMUL is different than FMUL with operand negation.  */
6441           bool neg0 = GET_CODE (op0) == NEG;
6442           bool neg1 = GET_CODE (op1) == NEG;
6443           if (compound_p || !flag_rounding_math || (neg0 && neg1))
6444             {
6445               if (neg0)
6446                 op0 = XEXP (op0, 0);
6447               if (neg1)
6448                 op1 = XEXP (op1, 0);
6449             }
6450
6451           if (compound_p)
6452             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
6453             cost += extra_cost->fp[mode == DFmode].fma;
6454           else
6455             /* FMUL/FNMUL.  */
6456             cost += extra_cost->fp[mode == DFmode].mult;
6457         }
6458
6459       cost += rtx_cost (op0, mode, MULT, 0, speed);
6460       cost += rtx_cost (op1, mode, MULT, 1, speed);
6461       return cost;
6462     }
6463 }
6464
6465 static int
6466 aarch64_address_cost (rtx x,
6467                       machine_mode mode,
6468                       addr_space_t as ATTRIBUTE_UNUSED,
6469                       bool speed)
6470 {
6471   enum rtx_code c = GET_CODE (x);
6472   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6473   struct aarch64_address_info info;
6474   int cost = 0;
6475   info.shift = 0;
6476
6477   if (!aarch64_classify_address (&info, x, mode, c, false))
6478     {
6479       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6480         {
6481           /* This is a CONST or SYMBOL ref which will be split
6482              in a different way depending on the code model in use.
6483              Cost it through the generic infrastructure.  */
6484           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6485           /* Divide through by the cost of one instruction to
6486              bring it to the same units as the address costs.  */
6487           cost_symbol_ref /= COSTS_N_INSNS (1);
6488           /* The cost is then the cost of preparing the address,
6489              followed by an immediate (possibly 0) offset.  */
6490           return cost_symbol_ref + addr_cost->imm_offset;
6491         }
6492       else
6493         {
6494           /* This is most likely a jump table from a case
6495              statement.  */
6496           return addr_cost->register_offset;
6497         }
6498     }
6499
6500   switch (info.type)
6501     {
6502       case ADDRESS_LO_SUM:
6503       case ADDRESS_SYMBOLIC:
6504       case ADDRESS_REG_IMM:
6505         cost += addr_cost->imm_offset;
6506         break;
6507
6508       case ADDRESS_REG_WB:
6509         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6510           cost += addr_cost->pre_modify;
6511         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6512           cost += addr_cost->post_modify;
6513         else
6514           gcc_unreachable ();
6515
6516         break;
6517
6518       case ADDRESS_REG_REG:
6519         cost += addr_cost->register_offset;
6520         break;
6521
6522       case ADDRESS_REG_SXTW:
6523         cost += addr_cost->register_sextend;
6524         break;
6525
6526       case ADDRESS_REG_UXTW:
6527         cost += addr_cost->register_zextend;
6528         break;
6529
6530       default:
6531         gcc_unreachable ();
6532     }
6533
6534
6535   if (info.shift > 0)
6536     {
6537       /* For the sake of calculating the cost of the shifted register
6538          component, we can treat same sized modes in the same way.  */
6539       switch (GET_MODE_BITSIZE (mode))
6540         {
6541           case 16:
6542             cost += addr_cost->addr_scale_costs.hi;
6543             break;
6544
6545           case 32:
6546             cost += addr_cost->addr_scale_costs.si;
6547             break;
6548
6549           case 64:
6550             cost += addr_cost->addr_scale_costs.di;
6551             break;
6552
6553           /* We can't tell, or this is a 128-bit vector.  */
6554           default:
6555             cost += addr_cost->addr_scale_costs.ti;
6556             break;
6557         }
6558     }
6559
6560   return cost;
6561 }
6562
6563 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
6564    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
6565    to be taken.  */
6566
6567 int
6568 aarch64_branch_cost (bool speed_p, bool predictable_p)
6569 {
6570   /* When optimizing for speed, use the cost of unpredictable branches.  */
6571   const struct cpu_branch_cost *branch_costs =
6572     aarch64_tune_params.branch_costs;
6573
6574   if (!speed_p || predictable_p)
6575     return branch_costs->predictable;
6576   else
6577     return branch_costs->unpredictable;
6578 }
6579
6580 /* Return true if the RTX X in mode MODE is a zero or sign extract
6581    usable in an ADD or SUB (extended register) instruction.  */
6582 static bool
6583 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6584 {
6585   /* Catch add with a sign extract.
6586      This is add_<optab><mode>_multp2.  */
6587   if (GET_CODE (x) == SIGN_EXTRACT
6588       || GET_CODE (x) == ZERO_EXTRACT)
6589     {
6590       rtx op0 = XEXP (x, 0);
6591       rtx op1 = XEXP (x, 1);
6592       rtx op2 = XEXP (x, 2);
6593
6594       if (GET_CODE (op0) == MULT
6595           && CONST_INT_P (op1)
6596           && op2 == const0_rtx
6597           && CONST_INT_P (XEXP (op0, 1))
6598           && aarch64_is_extend_from_extract (mode,
6599                                              XEXP (op0, 1),
6600                                              op1))
6601         {
6602           return true;
6603         }
6604     }
6605   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6606      No shift.  */
6607   else if (GET_CODE (x) == SIGN_EXTEND
6608            || GET_CODE (x) == ZERO_EXTEND)
6609     return REG_P (XEXP (x, 0));
6610
6611   return false;
6612 }
6613
6614 static bool
6615 aarch64_frint_unspec_p (unsigned int u)
6616 {
6617   switch (u)
6618     {
6619       case UNSPEC_FRINTZ:
6620       case UNSPEC_FRINTP:
6621       case UNSPEC_FRINTM:
6622       case UNSPEC_FRINTA:
6623       case UNSPEC_FRINTN:
6624       case UNSPEC_FRINTX:
6625       case UNSPEC_FRINTI:
6626         return true;
6627
6628       default:
6629         return false;
6630     }
6631 }
6632
6633 /* Return true iff X is an rtx that will match an extr instruction
6634    i.e. as described in the *extr<mode>5_insn family of patterns.
6635    OP0 and OP1 will be set to the operands of the shifts involved
6636    on success and will be NULL_RTX otherwise.  */
6637
6638 static bool
6639 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6640 {
6641   rtx op0, op1;
6642   machine_mode mode = GET_MODE (x);
6643
6644   *res_op0 = NULL_RTX;
6645   *res_op1 = NULL_RTX;
6646
6647   if (GET_CODE (x) != IOR)
6648     return false;
6649
6650   op0 = XEXP (x, 0);
6651   op1 = XEXP (x, 1);
6652
6653   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6654       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6655     {
6656      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6657       if (GET_CODE (op1) == ASHIFT)
6658         std::swap (op0, op1);
6659
6660       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6661         return false;
6662
6663       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6664       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6665
6666       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6667           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6668         {
6669           *res_op0 = XEXP (op0, 0);
6670           *res_op1 = XEXP (op1, 0);
6671           return true;
6672         }
6673     }
6674
6675   return false;
6676 }
6677
6678 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6679    storing it in *COST.  Result is true if the total cost of the operation
6680    has now been calculated.  */
6681 static bool
6682 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6683 {
6684   rtx inner;
6685   rtx comparator;
6686   enum rtx_code cmpcode;
6687
6688   if (COMPARISON_P (op0))
6689     {
6690       inner = XEXP (op0, 0);
6691       comparator = XEXP (op0, 1);
6692       cmpcode = GET_CODE (op0);
6693     }
6694   else
6695     {
6696       inner = op0;
6697       comparator = const0_rtx;
6698       cmpcode = NE;
6699     }
6700
6701   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6702     {
6703       /* Conditional branch.  */
6704       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6705         return true;
6706       else
6707         {
6708           if (cmpcode == NE || cmpcode == EQ)
6709             {
6710               if (comparator == const0_rtx)
6711                 {
6712                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6713                   if (GET_CODE (inner) == ZERO_EXTRACT)
6714                     /* TBZ/TBNZ.  */
6715                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6716                                        ZERO_EXTRACT, 0, speed);
6717                   else
6718                     /* CBZ/CBNZ.  */
6719                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6720
6721                 return true;
6722               }
6723             }
6724           else if (cmpcode == LT || cmpcode == GE)
6725             {
6726               /* TBZ/TBNZ.  */
6727               if (comparator == const0_rtx)
6728                 return true;
6729             }
6730         }
6731     }
6732   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6733     {
6734       /* CCMP.  */
6735       if (GET_CODE (op1) == COMPARE)
6736         {
6737           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6738           if (XEXP (op1, 1) == const0_rtx)
6739             *cost += 1;
6740           if (speed)
6741             {
6742               machine_mode mode = GET_MODE (XEXP (op1, 0));
6743               const struct cpu_cost_table *extra_cost
6744                 = aarch64_tune_params.insn_extra_cost;
6745
6746               if (GET_MODE_CLASS (mode) == MODE_INT)
6747                 *cost += extra_cost->alu.arith;
6748               else
6749                 *cost += extra_cost->fp[mode == DFmode].compare;
6750             }
6751           return true;
6752         }
6753
6754       /* It's a conditional operation based on the status flags,
6755          so it must be some flavor of CSEL.  */
6756
6757       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6758       if (GET_CODE (op1) == NEG
6759           || GET_CODE (op1) == NOT
6760           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6761         op1 = XEXP (op1, 0);
6762       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6763         {
6764           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6765           op1 = XEXP (op1, 0);
6766           op2 = XEXP (op2, 0);
6767         }
6768
6769       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6770       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6771       return true;
6772     }
6773
6774   /* We don't know what this is, cost all operands.  */
6775   return false;
6776 }
6777
6778 /* Check whether X is a bitfield operation of the form shift + extend that
6779    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6780    operand to which the bitfield operation is applied.  Otherwise return
6781    NULL_RTX.  */
6782
6783 static rtx
6784 aarch64_extend_bitfield_pattern_p (rtx x)
6785 {
6786   rtx_code outer_code = GET_CODE (x);
6787   machine_mode outer_mode = GET_MODE (x);
6788
6789   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6790       && outer_mode != SImode && outer_mode != DImode)
6791     return NULL_RTX;
6792
6793   rtx inner = XEXP (x, 0);
6794   rtx_code inner_code = GET_CODE (inner);
6795   machine_mode inner_mode = GET_MODE (inner);
6796   rtx op = NULL_RTX;
6797
6798   switch (inner_code)
6799     {
6800       case ASHIFT:
6801         if (CONST_INT_P (XEXP (inner, 1))
6802             && (inner_mode == QImode || inner_mode == HImode))
6803           op = XEXP (inner, 0);
6804         break;
6805       case LSHIFTRT:
6806         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6807             && (inner_mode == QImode || inner_mode == HImode))
6808           op = XEXP (inner, 0);
6809         break;
6810       case ASHIFTRT:
6811         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6812             && (inner_mode == QImode || inner_mode == HImode))
6813           op = XEXP (inner, 0);
6814         break;
6815       default:
6816         break;
6817     }
6818
6819   return op;
6820 }
6821
6822 /* Return true if the mask and a shift amount from an RTX of the form
6823    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6824    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6825
6826 bool
6827 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6828 {
6829   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6830          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6831          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6832          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6833 }
6834
6835 /* Calculate the cost of calculating X, storing it in *COST.  Result
6836    is true if the total cost of the operation has now been calculated.  */
6837 static bool
6838 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6839                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6840 {
6841   rtx op0, op1, op2;
6842   const struct cpu_cost_table *extra_cost
6843     = aarch64_tune_params.insn_extra_cost;
6844   int code = GET_CODE (x);
6845
6846   /* By default, assume that everything has equivalent cost to the
6847      cheapest instruction.  Any additional costs are applied as a delta
6848      above this default.  */
6849   *cost = COSTS_N_INSNS (1);
6850
6851   switch (code)
6852     {
6853     case SET:
6854       /* The cost depends entirely on the operands to SET.  */
6855       *cost = 0;
6856       op0 = SET_DEST (x);
6857       op1 = SET_SRC (x);
6858
6859       switch (GET_CODE (op0))
6860         {
6861         case MEM:
6862           if (speed)
6863             {
6864               rtx address = XEXP (op0, 0);
6865               if (VECTOR_MODE_P (mode))
6866                 *cost += extra_cost->ldst.storev;
6867               else if (GET_MODE_CLASS (mode) == MODE_INT)
6868                 *cost += extra_cost->ldst.store;
6869               else if (mode == SFmode)
6870                 *cost += extra_cost->ldst.storef;
6871               else if (mode == DFmode)
6872                 *cost += extra_cost->ldst.stored;
6873
6874               *cost +=
6875                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6876                                                      0, speed));
6877             }
6878
6879           *cost += rtx_cost (op1, mode, SET, 1, speed);
6880           return true;
6881
6882         case SUBREG:
6883           if (! REG_P (SUBREG_REG (op0)))
6884             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6885
6886           /* Fall through.  */
6887         case REG:
6888           /* The cost is one per vector-register copied.  */
6889           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6890             {
6891               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6892                               / GET_MODE_SIZE (V4SImode);
6893               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6894             }
6895           /* const0_rtx is in general free, but we will use an
6896              instruction to set a register to 0.  */
6897           else if (REG_P (op1) || op1 == const0_rtx)
6898             {
6899               /* The cost is 1 per register copied.  */
6900               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6901                               / UNITS_PER_WORD;
6902               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6903             }
6904           else
6905             /* Cost is just the cost of the RHS of the set.  */
6906             *cost += rtx_cost (op1, mode, SET, 1, speed);
6907           return true;
6908
6909         case ZERO_EXTRACT:
6910         case SIGN_EXTRACT:
6911           /* Bit-field insertion.  Strip any redundant widening of
6912              the RHS to meet the width of the target.  */
6913           if (GET_CODE (op1) == SUBREG)
6914             op1 = SUBREG_REG (op1);
6915           if ((GET_CODE (op1) == ZERO_EXTEND
6916                || GET_CODE (op1) == SIGN_EXTEND)
6917               && CONST_INT_P (XEXP (op0, 1))
6918               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6919                   >= INTVAL (XEXP (op0, 1))))
6920             op1 = XEXP (op1, 0);
6921
6922           if (CONST_INT_P (op1))
6923             {
6924               /* MOV immediate is assumed to always be cheap.  */
6925               *cost = COSTS_N_INSNS (1);
6926             }
6927           else
6928             {
6929               /* BFM.  */
6930               if (speed)
6931                 *cost += extra_cost->alu.bfi;
6932               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6933             }
6934
6935           return true;
6936
6937         default:
6938           /* We can't make sense of this, assume default cost.  */
6939           *cost = COSTS_N_INSNS (1);
6940           return false;
6941         }
6942       return false;
6943
6944     case CONST_INT:
6945       /* If an instruction can incorporate a constant within the
6946          instruction, the instruction's expression avoids calling
6947          rtx_cost() on the constant.  If rtx_cost() is called on a
6948          constant, then it is usually because the constant must be
6949          moved into a register by one or more instructions.
6950
6951          The exception is constant 0, which can be expressed
6952          as XZR/WZR and is therefore free.  The exception to this is
6953          if we have (set (reg) (const0_rtx)) in which case we must cost
6954          the move.  However, we can catch that when we cost the SET, so
6955          we don't need to consider that here.  */
6956       if (x == const0_rtx)
6957         *cost = 0;
6958       else
6959         {
6960           /* To an approximation, building any other constant is
6961              proportionally expensive to the number of instructions
6962              required to build that constant.  This is true whether we
6963              are compiling for SPEED or otherwise.  */
6964           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6965                                  (NULL_RTX, x, false, mode));
6966         }
6967       return true;
6968
6969     case CONST_DOUBLE:
6970
6971       /* First determine number of instructions to do the move
6972           as an integer constant.  */
6973       if (!aarch64_float_const_representable_p (x)
6974            && !aarch64_can_const_movi_rtx_p (x, mode)
6975            && aarch64_float_const_rtx_p (x))
6976         {
6977           unsigned HOST_WIDE_INT ival;
6978           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
6979           gcc_assert (succeed);
6980
6981           machine_mode imode = mode == HFmode ? SImode
6982                                               : int_mode_for_mode (mode);
6983           int ncost = aarch64_internal_mov_immediate
6984                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6985           *cost += COSTS_N_INSNS (ncost);
6986           return true;
6987         }
6988
6989       if (speed)
6990         {
6991           /* mov[df,sf]_aarch64.  */
6992           if (aarch64_float_const_representable_p (x))
6993             /* FMOV (scalar immediate).  */
6994             *cost += extra_cost->fp[mode == DFmode].fpconst;
6995           else if (!aarch64_float_const_zero_rtx_p (x))
6996             {
6997               /* This will be a load from memory.  */
6998               if (mode == DFmode)
6999                 *cost += extra_cost->ldst.loadd;
7000               else
7001                 *cost += extra_cost->ldst.loadf;
7002             }
7003           else
7004             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
7005                or MOV v0.s[0], wzr - neither of which are modeled by the
7006                cost tables.  Just use the default cost.  */
7007             {
7008             }
7009         }
7010
7011       return true;
7012
7013     case MEM:
7014       if (speed)
7015         {
7016           /* For loads we want the base cost of a load, plus an
7017              approximation for the additional cost of the addressing
7018              mode.  */
7019           rtx address = XEXP (x, 0);
7020           if (VECTOR_MODE_P (mode))
7021             *cost += extra_cost->ldst.loadv;
7022           else if (GET_MODE_CLASS (mode) == MODE_INT)
7023             *cost += extra_cost->ldst.load;
7024           else if (mode == SFmode)
7025             *cost += extra_cost->ldst.loadf;
7026           else if (mode == DFmode)
7027             *cost += extra_cost->ldst.loadd;
7028
7029           *cost +=
7030                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7031                                                      0, speed));
7032         }
7033
7034       return true;
7035
7036     case NEG:
7037       op0 = XEXP (x, 0);
7038
7039       if (VECTOR_MODE_P (mode))
7040         {
7041           if (speed)
7042             {
7043               /* FNEG.  */
7044               *cost += extra_cost->vect.alu;
7045             }
7046           return false;
7047         }
7048
7049       if (GET_MODE_CLASS (mode) == MODE_INT)
7050         {
7051           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7052               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7053             {
7054               /* CSETM.  */
7055               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
7056               return true;
7057             }
7058
7059           /* Cost this as SUB wzr, X.  */
7060           op0 = CONST0_RTX (mode);
7061           op1 = XEXP (x, 0);
7062           goto cost_minus;
7063         }
7064
7065       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7066         {
7067           /* Support (neg(fma...)) as a single instruction only if
7068              sign of zeros is unimportant.  This matches the decision
7069              making in aarch64.md.  */
7070           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
7071             {
7072               /* FNMADD.  */
7073               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7074               return true;
7075             }
7076           if (GET_CODE (op0) == MULT)
7077             {
7078               /* FNMUL.  */
7079               *cost = rtx_cost (op0, mode, NEG, 0, speed);
7080               return true;
7081             }
7082           if (speed)
7083             /* FNEG.  */
7084             *cost += extra_cost->fp[mode == DFmode].neg;
7085           return false;
7086         }
7087
7088       return false;
7089
7090     case CLRSB:
7091     case CLZ:
7092       if (speed)
7093         {
7094           if (VECTOR_MODE_P (mode))
7095             *cost += extra_cost->vect.alu;
7096           else
7097             *cost += extra_cost->alu.clz;
7098         }
7099
7100       return false;
7101
7102     case COMPARE:
7103       op0 = XEXP (x, 0);
7104       op1 = XEXP (x, 1);
7105
7106       if (op1 == const0_rtx
7107           && GET_CODE (op0) == AND)
7108         {
7109           x = op0;
7110           mode = GET_MODE (op0);
7111           goto cost_logic;
7112         }
7113
7114       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
7115         {
7116           /* TODO: A write to the CC flags possibly costs extra, this
7117              needs encoding in the cost tables.  */
7118
7119           mode = GET_MODE (op0);
7120           /* ANDS.  */
7121           if (GET_CODE (op0) == AND)
7122             {
7123               x = op0;
7124               goto cost_logic;
7125             }
7126
7127           if (GET_CODE (op0) == PLUS)
7128             {
7129               /* ADDS (and CMN alias).  */
7130               x = op0;
7131               goto cost_plus;
7132             }
7133
7134           if (GET_CODE (op0) == MINUS)
7135             {
7136               /* SUBS.  */
7137               x = op0;
7138               goto cost_minus;
7139             }
7140
7141           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
7142               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
7143               && CONST_INT_P (XEXP (op0, 2)))
7144             {
7145               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7146                  Handle it here directly rather than going to cost_logic
7147                  since we know the immediate generated for the TST is valid
7148                  so we can avoid creating an intermediate rtx for it only
7149                  for costing purposes.  */
7150               if (speed)
7151                 *cost += extra_cost->alu.logical;
7152
7153               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
7154                                  ZERO_EXTRACT, 0, speed);
7155               return true;
7156             }
7157
7158           if (GET_CODE (op1) == NEG)
7159             {
7160               /* CMN.  */
7161               if (speed)
7162                 *cost += extra_cost->alu.arith;
7163
7164               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
7165               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
7166               return true;
7167             }
7168
7169           /* CMP.
7170
7171              Compare can freely swap the order of operands, and
7172              canonicalization puts the more complex operation first.
7173              But the integer MINUS logic expects the shift/extend
7174              operation in op1.  */
7175           if (! (REG_P (op0)
7176                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
7177           {
7178             op0 = XEXP (x, 1);
7179             op1 = XEXP (x, 0);
7180           }
7181           goto cost_minus;
7182         }
7183
7184       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
7185         {
7186           /* FCMP.  */
7187           if (speed)
7188             *cost += extra_cost->fp[mode == DFmode].compare;
7189
7190           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
7191             {
7192               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
7193               /* FCMP supports constant 0.0 for no extra cost. */
7194               return true;
7195             }
7196           return false;
7197         }
7198
7199       if (VECTOR_MODE_P (mode))
7200         {
7201           /* Vector compare.  */
7202           if (speed)
7203             *cost += extra_cost->vect.alu;
7204
7205           if (aarch64_float_const_zero_rtx_p (op1))
7206             {
7207               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7208                  cost.  */
7209               return true;
7210             }
7211           return false;
7212         }
7213       return false;
7214
7215     case MINUS:
7216       {
7217         op0 = XEXP (x, 0);
7218         op1 = XEXP (x, 1);
7219
7220 cost_minus:
7221         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
7222
7223         /* Detect valid immediates.  */
7224         if ((GET_MODE_CLASS (mode) == MODE_INT
7225              || (GET_MODE_CLASS (mode) == MODE_CC
7226                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
7227             && CONST_INT_P (op1)
7228             && aarch64_uimm12_shift (INTVAL (op1)))
7229           {
7230             if (speed)
7231               /* SUB(S) (immediate).  */
7232               *cost += extra_cost->alu.arith;
7233             return true;
7234           }
7235
7236         /* Look for SUB (extended register).  */
7237         if (aarch64_rtx_arith_op_extract_p (op1, mode))
7238           {
7239             if (speed)
7240               *cost += extra_cost->alu.extend_arith;
7241
7242             op1 = aarch64_strip_extend (op1, true);
7243             *cost += rtx_cost (op1, VOIDmode,
7244                                (enum rtx_code) GET_CODE (op1), 0, speed);
7245             return true;
7246           }
7247
7248         rtx new_op1 = aarch64_strip_extend (op1, false);
7249
7250         /* Cost this as an FMA-alike operation.  */
7251         if ((GET_CODE (new_op1) == MULT
7252              || aarch64_shift_p (GET_CODE (new_op1)))
7253             && code != COMPARE)
7254           {
7255             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
7256                                             (enum rtx_code) code,
7257                                             speed);
7258             return true;
7259           }
7260
7261         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
7262
7263         if (speed)
7264           {
7265             if (VECTOR_MODE_P (mode))
7266               {
7267                 /* Vector SUB.  */
7268                 *cost += extra_cost->vect.alu;
7269               }
7270             else if (GET_MODE_CLASS (mode) == MODE_INT)
7271               {
7272                 /* SUB(S).  */
7273                 *cost += extra_cost->alu.arith;
7274               }
7275             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7276               {
7277                 /* FSUB.  */
7278                 *cost += extra_cost->fp[mode == DFmode].addsub;
7279               }
7280           }
7281         return true;
7282       }
7283
7284     case PLUS:
7285       {
7286         rtx new_op0;
7287
7288         op0 = XEXP (x, 0);
7289         op1 = XEXP (x, 1);
7290
7291 cost_plus:
7292         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7293             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7294           {
7295             /* CSINC.  */
7296             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7297             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7298             return true;
7299           }
7300
7301         if (GET_MODE_CLASS (mode) == MODE_INT
7302             && CONST_INT_P (op1)
7303             && aarch64_uimm12_shift (INTVAL (op1)))
7304           {
7305             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7306
7307             if (speed)
7308               /* ADD (immediate).  */
7309               *cost += extra_cost->alu.arith;
7310             return true;
7311           }
7312
7313         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7314
7315         /* Look for ADD (extended register).  */
7316         if (aarch64_rtx_arith_op_extract_p (op0, mode))
7317           {
7318             if (speed)
7319               *cost += extra_cost->alu.extend_arith;
7320
7321             op0 = aarch64_strip_extend (op0, true);
7322             *cost += rtx_cost (op0, VOIDmode,
7323                                (enum rtx_code) GET_CODE (op0), 0, speed);
7324             return true;
7325           }
7326
7327         /* Strip any extend, leave shifts behind as we will
7328            cost them through mult_cost.  */
7329         new_op0 = aarch64_strip_extend (op0, false);
7330
7331         if (GET_CODE (new_op0) == MULT
7332             || aarch64_shift_p (GET_CODE (new_op0)))
7333           {
7334             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7335                                             speed);
7336             return true;
7337           }
7338
7339         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7340
7341         if (speed)
7342           {
7343             if (VECTOR_MODE_P (mode))
7344               {
7345                 /* Vector ADD.  */
7346                 *cost += extra_cost->vect.alu;
7347               }
7348             else if (GET_MODE_CLASS (mode) == MODE_INT)
7349               {
7350                 /* ADD.  */
7351                 *cost += extra_cost->alu.arith;
7352               }
7353             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7354               {
7355                 /* FADD.  */
7356                 *cost += extra_cost->fp[mode == DFmode].addsub;
7357               }
7358           }
7359         return true;
7360       }
7361
7362     case BSWAP:
7363       *cost = COSTS_N_INSNS (1);
7364
7365       if (speed)
7366         {
7367           if (VECTOR_MODE_P (mode))
7368             *cost += extra_cost->vect.alu;
7369           else
7370             *cost += extra_cost->alu.rev;
7371         }
7372       return false;
7373
7374     case IOR:
7375       if (aarch_rev16_p (x))
7376         {
7377           *cost = COSTS_N_INSNS (1);
7378
7379           if (speed)
7380             {
7381               if (VECTOR_MODE_P (mode))
7382                 *cost += extra_cost->vect.alu;
7383               else
7384                 *cost += extra_cost->alu.rev;
7385             }
7386           return true;
7387         }
7388
7389       if (aarch64_extr_rtx_p (x, &op0, &op1))
7390         {
7391           *cost += rtx_cost (op0, mode, IOR, 0, speed);
7392           *cost += rtx_cost (op1, mode, IOR, 1, speed);
7393           if (speed)
7394             *cost += extra_cost->alu.shift;
7395
7396           return true;
7397         }
7398     /* Fall through.  */
7399     case XOR:
7400     case AND:
7401     cost_logic:
7402       op0 = XEXP (x, 0);
7403       op1 = XEXP (x, 1);
7404
7405       if (VECTOR_MODE_P (mode))
7406         {
7407           if (speed)
7408             *cost += extra_cost->vect.alu;
7409           return true;
7410         }
7411
7412       if (code == AND
7413           && GET_CODE (op0) == MULT
7414           && CONST_INT_P (XEXP (op0, 1))
7415           && CONST_INT_P (op1)
7416           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7417                                INTVAL (op1)) != 0)
7418         {
7419           /* This is a UBFM/SBFM.  */
7420           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7421           if (speed)
7422             *cost += extra_cost->alu.bfx;
7423           return true;
7424         }
7425
7426       if (GET_MODE_CLASS (mode) == MODE_INT)
7427         {
7428           if (CONST_INT_P (op1))
7429             {
7430               /* We have a mask + shift version of a UBFIZ
7431                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
7432               if (GET_CODE (op0) == ASHIFT
7433                   && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7434                                                           XEXP (op0, 1)))
7435                 {
7436                   *cost += rtx_cost (XEXP (op0, 0), mode,
7437                                      (enum rtx_code) code, 0, speed);
7438                   if (speed)
7439                     *cost += extra_cost->alu.bfx;
7440
7441                   return true;
7442                 }
7443               else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7444                 {
7445                 /* We possibly get the immediate for free, this is not
7446                    modelled.  */
7447                   *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7448                   if (speed)
7449                     *cost += extra_cost->alu.logical;
7450
7451                   return true;
7452                 }
7453             }
7454           else
7455             {
7456               rtx new_op0 = op0;
7457
7458               /* Handle ORN, EON, or BIC.  */
7459               if (GET_CODE (op0) == NOT)
7460                 op0 = XEXP (op0, 0);
7461
7462               new_op0 = aarch64_strip_shift (op0);
7463
7464               /* If we had a shift on op0 then this is a logical-shift-
7465                  by-register/immediate operation.  Otherwise, this is just
7466                  a logical operation.  */
7467               if (speed)
7468                 {
7469                   if (new_op0 != op0)
7470                     {
7471                       /* Shift by immediate.  */
7472                       if (CONST_INT_P (XEXP (op0, 1)))
7473                         *cost += extra_cost->alu.log_shift;
7474                       else
7475                         *cost += extra_cost->alu.log_shift_reg;
7476                     }
7477                   else
7478                     *cost += extra_cost->alu.logical;
7479                 }
7480
7481               /* In both cases we want to cost both operands.  */
7482               *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7483               *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7484
7485               return true;
7486             }
7487         }
7488       return false;
7489
7490     case NOT:
7491       x = XEXP (x, 0);
7492       op0 = aarch64_strip_shift (x);
7493
7494       if (VECTOR_MODE_P (mode))
7495         {
7496           /* Vector NOT.  */
7497           *cost += extra_cost->vect.alu;
7498           return false;
7499         }
7500
7501       /* MVN-shifted-reg.  */
7502       if (op0 != x)
7503         {
7504           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7505
7506           if (speed)
7507             *cost += extra_cost->alu.log_shift;
7508
7509           return true;
7510         }
7511       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7512          Handle the second form here taking care that 'a' in the above can
7513          be a shift.  */
7514       else if (GET_CODE (op0) == XOR)
7515         {
7516           rtx newop0 = XEXP (op0, 0);
7517           rtx newop1 = XEXP (op0, 1);
7518           rtx op0_stripped = aarch64_strip_shift (newop0);
7519
7520           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7521           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7522
7523           if (speed)
7524             {
7525               if (op0_stripped != newop0)
7526                 *cost += extra_cost->alu.log_shift;
7527               else
7528                 *cost += extra_cost->alu.logical;
7529             }
7530
7531           return true;
7532         }
7533       /* MVN.  */
7534       if (speed)
7535         *cost += extra_cost->alu.logical;
7536
7537       return false;
7538
7539     case ZERO_EXTEND:
7540
7541       op0 = XEXP (x, 0);
7542       /* If a value is written in SI mode, then zero extended to DI
7543          mode, the operation will in general be free as a write to
7544          a 'w' register implicitly zeroes the upper bits of an 'x'
7545          register.  However, if this is
7546
7547            (set (reg) (zero_extend (reg)))
7548
7549          we must cost the explicit register move.  */
7550       if (mode == DImode
7551           && GET_MODE (op0) == SImode
7552           && outer == SET)
7553         {
7554           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7555
7556         /* If OP_COST is non-zero, then the cost of the zero extend
7557            is effectively the cost of the inner operation.  Otherwise
7558            we have a MOV instruction and we take the cost from the MOV
7559            itself.  This is true independently of whether we are
7560            optimizing for space or time.  */
7561           if (op_cost)
7562             *cost = op_cost;
7563
7564           return true;
7565         }
7566       else if (MEM_P (op0))
7567         {
7568           /* All loads can zero extend to any size for free.  */
7569           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7570           return true;
7571         }
7572
7573       op0 = aarch64_extend_bitfield_pattern_p (x);
7574       if (op0)
7575         {
7576           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7577           if (speed)
7578             *cost += extra_cost->alu.bfx;
7579           return true;
7580         }
7581
7582       if (speed)
7583         {
7584           if (VECTOR_MODE_P (mode))
7585             {
7586               /* UMOV.  */
7587               *cost += extra_cost->vect.alu;
7588             }
7589           else
7590             {
7591               /* We generate an AND instead of UXTB/UXTH.  */
7592               *cost += extra_cost->alu.logical;
7593             }
7594         }
7595       return false;
7596
7597     case SIGN_EXTEND:
7598       if (MEM_P (XEXP (x, 0)))
7599         {
7600           /* LDRSH.  */
7601           if (speed)
7602             {
7603               rtx address = XEXP (XEXP (x, 0), 0);
7604               *cost += extra_cost->ldst.load_sign_extend;
7605
7606               *cost +=
7607                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7608                                                      0, speed));
7609             }
7610           return true;
7611         }
7612
7613       op0 = aarch64_extend_bitfield_pattern_p (x);
7614       if (op0)
7615         {
7616           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7617           if (speed)
7618             *cost += extra_cost->alu.bfx;
7619           return true;
7620         }
7621
7622       if (speed)
7623         {
7624           if (VECTOR_MODE_P (mode))
7625             *cost += extra_cost->vect.alu;
7626           else
7627             *cost += extra_cost->alu.extend;
7628         }
7629       return false;
7630
7631     case ASHIFT:
7632       op0 = XEXP (x, 0);
7633       op1 = XEXP (x, 1);
7634
7635       if (CONST_INT_P (op1))
7636         {
7637           if (speed)
7638             {
7639               if (VECTOR_MODE_P (mode))
7640                 {
7641                   /* Vector shift (immediate).  */
7642                   *cost += extra_cost->vect.alu;
7643                 }
7644               else
7645                 {
7646                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
7647                      aliases.  */
7648                   *cost += extra_cost->alu.shift;
7649                 }
7650             }
7651
7652           /* We can incorporate zero/sign extend for free.  */
7653           if (GET_CODE (op0) == ZERO_EXTEND
7654               || GET_CODE (op0) == SIGN_EXTEND)
7655             op0 = XEXP (op0, 0);
7656
7657           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7658           return true;
7659         }
7660       else
7661         {
7662           if (VECTOR_MODE_P (mode))
7663             {
7664               if (speed)
7665                 /* Vector shift (register).  */
7666                 *cost += extra_cost->vect.alu;
7667             }
7668           else
7669             {
7670               if (speed)
7671                 /* LSLV.  */
7672                 *cost += extra_cost->alu.shift_reg;
7673
7674               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7675                   && CONST_INT_P (XEXP (op1, 1))
7676                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7677                 {
7678                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7679                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7680                      don't recurse into it.  */
7681                   return true;
7682                 }
7683             }
7684           return false;  /* All arguments need to be in registers.  */
7685         }
7686
7687     case ROTATE:
7688     case ROTATERT:
7689     case LSHIFTRT:
7690     case ASHIFTRT:
7691       op0 = XEXP (x, 0);
7692       op1 = XEXP (x, 1);
7693
7694       if (CONST_INT_P (op1))
7695         {
7696           /* ASR (immediate) and friends.  */
7697           if (speed)
7698             {
7699               if (VECTOR_MODE_P (mode))
7700                 *cost += extra_cost->vect.alu;
7701               else
7702                 *cost += extra_cost->alu.shift;
7703             }
7704
7705           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7706           return true;
7707         }
7708       else
7709         {
7710           if (VECTOR_MODE_P (mode))
7711             {
7712               if (speed)
7713                 /* Vector shift (register).  */
7714                 *cost += extra_cost->vect.alu;
7715             }
7716           else
7717             {
7718               if (speed)
7719                 /* ASR (register) and friends.  */
7720                 *cost += extra_cost->alu.shift_reg;
7721
7722               if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
7723                   && CONST_INT_P (XEXP (op1, 1))
7724                   && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
7725                 {
7726                   *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
7727                   /* We already demanded XEXP (op1, 0) to be REG_P, so
7728                      don't recurse into it.  */
7729                   return true;
7730                 }
7731             }
7732           return false;  /* All arguments need to be in registers.  */
7733         }
7734
7735     case SYMBOL_REF:
7736
7737       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7738           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7739         {
7740           /* LDR.  */
7741           if (speed)
7742             *cost += extra_cost->ldst.load;
7743         }
7744       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7745                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7746         {
7747           /* ADRP, followed by ADD.  */
7748           *cost += COSTS_N_INSNS (1);
7749           if (speed)
7750             *cost += 2 * extra_cost->alu.arith;
7751         }
7752       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7753                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7754         {
7755           /* ADR.  */
7756           if (speed)
7757             *cost += extra_cost->alu.arith;
7758         }
7759
7760       if (flag_pic)
7761         {
7762           /* One extra load instruction, after accessing the GOT.  */
7763           *cost += COSTS_N_INSNS (1);
7764           if (speed)
7765             *cost += extra_cost->ldst.load;
7766         }
7767       return true;
7768
7769     case HIGH:
7770     case LO_SUM:
7771       /* ADRP/ADD (immediate).  */
7772       if (speed)
7773         *cost += extra_cost->alu.arith;
7774       return true;
7775
7776     case ZERO_EXTRACT:
7777     case SIGN_EXTRACT:
7778       /* UBFX/SBFX.  */
7779       if (speed)
7780         {
7781           if (VECTOR_MODE_P (mode))
7782             *cost += extra_cost->vect.alu;
7783           else
7784             *cost += extra_cost->alu.bfx;
7785         }
7786
7787       /* We can trust that the immediates used will be correct (there
7788          are no by-register forms), so we need only cost op0.  */
7789       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7790       return true;
7791
7792     case MULT:
7793       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7794       /* aarch64_rtx_mult_cost always handles recursion to its
7795          operands.  */
7796       return true;
7797
7798     case MOD:
7799     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7800        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7801        an unconditional negate.  This case should only ever be reached through
7802        the set_smod_pow2_cheap check in expmed.c.  */
7803       if (CONST_INT_P (XEXP (x, 1))
7804           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7805           && (mode == SImode || mode == DImode))
7806         {
7807           /* We expand to 4 instructions.  Reset the baseline.  */
7808           *cost = COSTS_N_INSNS (4);
7809
7810           if (speed)
7811             *cost += 2 * extra_cost->alu.logical
7812                      + 2 * extra_cost->alu.arith;
7813
7814           return true;
7815         }
7816
7817     /* Fall-through.  */
7818     case UMOD:
7819       if (speed)
7820         {
7821           /* Slighly prefer UMOD over SMOD.  */
7822           if (VECTOR_MODE_P (mode))
7823             *cost += extra_cost->vect.alu;
7824           else if (GET_MODE_CLASS (mode) == MODE_INT)
7825             *cost += (extra_cost->mult[mode == DImode].add
7826                       + extra_cost->mult[mode == DImode].idiv
7827                       + (code == MOD ? 1 : 0));
7828         }
7829       return false;  /* All arguments need to be in registers.  */
7830
7831     case DIV:
7832     case UDIV:
7833     case SQRT:
7834       if (speed)
7835         {
7836           if (VECTOR_MODE_P (mode))
7837             *cost += extra_cost->vect.alu;
7838           else if (GET_MODE_CLASS (mode) == MODE_INT)
7839             /* There is no integer SQRT, so only DIV and UDIV can get
7840                here.  */
7841             *cost += (extra_cost->mult[mode == DImode].idiv
7842                      /* Slighly prefer UDIV over SDIV.  */
7843                      + (code == DIV ? 1 : 0));
7844           else
7845             *cost += extra_cost->fp[mode == DFmode].div;
7846         }
7847       return false;  /* All arguments need to be in registers.  */
7848
7849     case IF_THEN_ELSE:
7850       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7851                                          XEXP (x, 2), cost, speed);
7852
7853     case EQ:
7854     case NE:
7855     case GT:
7856     case GTU:
7857     case LT:
7858     case LTU:
7859     case GE:
7860     case GEU:
7861     case LE:
7862     case LEU:
7863
7864       return false; /* All arguments must be in registers.  */
7865
7866     case FMA:
7867       op0 = XEXP (x, 0);
7868       op1 = XEXP (x, 1);
7869       op2 = XEXP (x, 2);
7870
7871       if (speed)
7872         {
7873           if (VECTOR_MODE_P (mode))
7874             *cost += extra_cost->vect.alu;
7875           else
7876             *cost += extra_cost->fp[mode == DFmode].fma;
7877         }
7878
7879       /* FMSUB, FNMADD, and FNMSUB are free.  */
7880       if (GET_CODE (op0) == NEG)
7881         op0 = XEXP (op0, 0);
7882
7883       if (GET_CODE (op2) == NEG)
7884         op2 = XEXP (op2, 0);
7885
7886       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7887          and the by-element operand as operand 0.  */
7888       if (GET_CODE (op1) == NEG)
7889         op1 = XEXP (op1, 0);
7890
7891       /* Catch vector-by-element operations.  The by-element operand can
7892          either be (vec_duplicate (vec_select (x))) or just
7893          (vec_select (x)), depending on whether we are multiplying by
7894          a vector or a scalar.
7895
7896          Canonicalization is not very good in these cases, FMA4 will put the
7897          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7898       if (GET_CODE (op0) == VEC_DUPLICATE)
7899         op0 = XEXP (op0, 0);
7900       else if (GET_CODE (op1) == VEC_DUPLICATE)
7901         op1 = XEXP (op1, 0);
7902
7903       if (GET_CODE (op0) == VEC_SELECT)
7904         op0 = XEXP (op0, 0);
7905       else if (GET_CODE (op1) == VEC_SELECT)
7906         op1 = XEXP (op1, 0);
7907
7908       /* If the remaining parameters are not registers,
7909          get the cost to put them into registers.  */
7910       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7911       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7912       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7913       return true;
7914
7915     case FLOAT:
7916     case UNSIGNED_FLOAT:
7917       if (speed)
7918         *cost += extra_cost->fp[mode == DFmode].fromint;
7919       return false;
7920
7921     case FLOAT_EXTEND:
7922       if (speed)
7923         {
7924           if (VECTOR_MODE_P (mode))
7925             {
7926               /*Vector truncate.  */
7927               *cost += extra_cost->vect.alu;
7928             }
7929           else
7930             *cost += extra_cost->fp[mode == DFmode].widen;
7931         }
7932       return false;
7933
7934     case FLOAT_TRUNCATE:
7935       if (speed)
7936         {
7937           if (VECTOR_MODE_P (mode))
7938             {
7939               /*Vector conversion.  */
7940               *cost += extra_cost->vect.alu;
7941             }
7942           else
7943             *cost += extra_cost->fp[mode == DFmode].narrow;
7944         }
7945       return false;
7946
7947     case FIX:
7948     case UNSIGNED_FIX:
7949       x = XEXP (x, 0);
7950       /* Strip the rounding part.  They will all be implemented
7951          by the fcvt* family of instructions anyway.  */
7952       if (GET_CODE (x) == UNSPEC)
7953         {
7954           unsigned int uns_code = XINT (x, 1);
7955
7956           if (uns_code == UNSPEC_FRINTA
7957               || uns_code == UNSPEC_FRINTM
7958               || uns_code == UNSPEC_FRINTN
7959               || uns_code == UNSPEC_FRINTP
7960               || uns_code == UNSPEC_FRINTZ)
7961             x = XVECEXP (x, 0, 0);
7962         }
7963
7964       if (speed)
7965         {
7966           if (VECTOR_MODE_P (mode))
7967             *cost += extra_cost->vect.alu;
7968           else
7969             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7970         }
7971
7972       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7973          fixed-point fcvt.  */
7974       if (GET_CODE (x) == MULT
7975           && ((VECTOR_MODE_P (mode)
7976                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7977               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7978         {
7979           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7980                              0, speed);
7981           return true;
7982         }
7983
7984       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7985       return true;
7986
7987     case ABS:
7988       if (VECTOR_MODE_P (mode))
7989         {
7990           /* ABS (vector).  */
7991           if (speed)
7992             *cost += extra_cost->vect.alu;
7993         }
7994       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7995         {
7996           op0 = XEXP (x, 0);
7997
7998           /* FABD, which is analogous to FADD.  */
7999           if (GET_CODE (op0) == MINUS)
8000             {
8001               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
8002               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
8003               if (speed)
8004                 *cost += extra_cost->fp[mode == DFmode].addsub;
8005
8006               return true;
8007             }
8008           /* Simple FABS is analogous to FNEG.  */
8009           if (speed)
8010             *cost += extra_cost->fp[mode == DFmode].neg;
8011         }
8012       else
8013         {
8014           /* Integer ABS will either be split to
8015              two arithmetic instructions, or will be an ABS
8016              (scalar), which we don't model.  */
8017           *cost = COSTS_N_INSNS (2);
8018           if (speed)
8019             *cost += 2 * extra_cost->alu.arith;
8020         }
8021       return false;
8022
8023     case SMAX:
8024     case SMIN:
8025       if (speed)
8026         {
8027           if (VECTOR_MODE_P (mode))
8028             *cost += extra_cost->vect.alu;
8029           else
8030             {
8031               /* FMAXNM/FMINNM/FMAX/FMIN.
8032                  TODO: This may not be accurate for all implementations, but
8033                  we do not model this in the cost tables.  */
8034               *cost += extra_cost->fp[mode == DFmode].addsub;
8035             }
8036         }
8037       return false;
8038
8039     case UNSPEC:
8040       /* The floating point round to integer frint* instructions.  */
8041       if (aarch64_frint_unspec_p (XINT (x, 1)))
8042         {
8043           if (speed)
8044             *cost += extra_cost->fp[mode == DFmode].roundint;
8045
8046           return false;
8047         }
8048
8049       if (XINT (x, 1) == UNSPEC_RBIT)
8050         {
8051           if (speed)
8052             *cost += extra_cost->alu.rev;
8053
8054           return false;
8055         }
8056       break;
8057
8058     case TRUNCATE:
8059
8060       /* Decompose <su>muldi3_highpart.  */
8061       if (/* (truncate:DI  */
8062           mode == DImode
8063           /*   (lshiftrt:TI  */
8064           && GET_MODE (XEXP (x, 0)) == TImode
8065           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
8066           /*      (mult:TI  */
8067           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
8068           /*        (ANY_EXTEND:TI (reg:DI))
8069                     (ANY_EXTEND:TI (reg:DI)))  */
8070           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
8071                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
8072               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
8073                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
8074           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
8075           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
8076           /*     (const_int 64)  */
8077           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8078           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
8079         {
8080           /* UMULH/SMULH.  */
8081           if (speed)
8082             *cost += extra_cost->mult[mode == DImode].extend;
8083           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
8084                              mode, MULT, 0, speed);
8085           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
8086                              mode, MULT, 1, speed);
8087           return true;
8088         }
8089
8090       /* Fall through.  */
8091     default:
8092       break;
8093     }
8094
8095   if (dump_file
8096       && flag_aarch64_verbose_cost)
8097     fprintf (dump_file,
8098       "\nFailed to cost RTX.  Assuming default cost.\n");
8099
8100   return true;
8101 }
8102
8103 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8104    calculated for X.  This cost is stored in *COST.  Returns true
8105    if the total cost of X was calculated.  */
8106 static bool
8107 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
8108                    int param, int *cost, bool speed)
8109 {
8110   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
8111
8112   if (dump_file
8113       && flag_aarch64_verbose_cost)
8114     {
8115       print_rtl_single (dump_file, x);
8116       fprintf (dump_file, "\n%s cost: %d (%s)\n",
8117                speed ? "Hot" : "Cold",
8118                *cost, result ? "final" : "partial");
8119     }
8120
8121   return result;
8122 }
8123
8124 static int
8125 aarch64_register_move_cost (machine_mode mode,
8126                             reg_class_t from_i, reg_class_t to_i)
8127 {
8128   enum reg_class from = (enum reg_class) from_i;
8129   enum reg_class to = (enum reg_class) to_i;
8130   const struct cpu_regmove_cost *regmove_cost
8131     = aarch64_tune_params.regmove_cost;
8132
8133   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
8134   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
8135     to = GENERAL_REGS;
8136
8137   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
8138     from = GENERAL_REGS;
8139
8140   /* Moving between GPR and stack cost is the same as GP2GP.  */
8141   if ((from == GENERAL_REGS && to == STACK_REG)
8142       || (to == GENERAL_REGS && from == STACK_REG))
8143     return regmove_cost->GP2GP;
8144
8145   /* To/From the stack register, we move via the gprs.  */
8146   if (to == STACK_REG || from == STACK_REG)
8147     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
8148             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
8149
8150   if (GET_MODE_SIZE (mode) == 16)
8151     {
8152       /* 128-bit operations on general registers require 2 instructions.  */
8153       if (from == GENERAL_REGS && to == GENERAL_REGS)
8154         return regmove_cost->GP2GP * 2;
8155       else if (from == GENERAL_REGS)
8156         return regmove_cost->GP2FP * 2;
8157       else if (to == GENERAL_REGS)
8158         return regmove_cost->FP2GP * 2;
8159
8160       /* When AdvSIMD instructions are disabled it is not possible to move
8161          a 128-bit value directly between Q registers.  This is handled in
8162          secondary reload.  A general register is used as a scratch to move
8163          the upper DI value and the lower DI value is moved directly,
8164          hence the cost is the sum of three moves. */
8165       if (! TARGET_SIMD)
8166         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
8167
8168       return regmove_cost->FP2FP;
8169     }
8170
8171   if (from == GENERAL_REGS && to == GENERAL_REGS)
8172     return regmove_cost->GP2GP;
8173   else if (from == GENERAL_REGS)
8174     return regmove_cost->GP2FP;
8175   else if (to == GENERAL_REGS)
8176     return regmove_cost->FP2GP;
8177
8178   return regmove_cost->FP2FP;
8179 }
8180
8181 static int
8182 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
8183                           reg_class_t rclass ATTRIBUTE_UNUSED,
8184                           bool in ATTRIBUTE_UNUSED)
8185 {
8186   return aarch64_tune_params.memmov_cost;
8187 }
8188
8189 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8190    to optimize 1.0/sqrt.  */
8191
8192 static bool
8193 use_rsqrt_p (machine_mode mode)
8194 {
8195   return (!flag_trapping_math
8196           && flag_unsafe_math_optimizations
8197           && ((aarch64_tune_params.approx_modes->recip_sqrt
8198                & AARCH64_APPROX_MODE (mode))
8199               || flag_mrecip_low_precision_sqrt));
8200 }
8201
8202 /* Function to decide when to use the approximate reciprocal square root
8203    builtin.  */
8204
8205 static tree
8206 aarch64_builtin_reciprocal (tree fndecl)
8207 {
8208   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
8209
8210   if (!use_rsqrt_p (mode))
8211     return NULL_TREE;
8212   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
8213 }
8214
8215 typedef rtx (*rsqrte_type) (rtx, rtx);
8216
8217 /* Select reciprocal square root initial estimate insn depending on machine
8218    mode.  */
8219
8220 static rsqrte_type
8221 get_rsqrte_type (machine_mode mode)
8222 {
8223   switch (mode)
8224   {
8225     case E_DFmode:   return gen_aarch64_rsqrtedf;
8226     case E_SFmode:   return gen_aarch64_rsqrtesf;
8227     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
8228     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
8229     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
8230     default: gcc_unreachable ();
8231   }
8232 }
8233
8234 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
8235
8236 /* Select reciprocal square root series step insn depending on machine mode.  */
8237
8238 static rsqrts_type
8239 get_rsqrts_type (machine_mode mode)
8240 {
8241   switch (mode)
8242   {
8243     case E_DFmode:   return gen_aarch64_rsqrtsdf;
8244     case E_SFmode:   return gen_aarch64_rsqrtssf;
8245     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
8246     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
8247     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
8248     default: gcc_unreachable ();
8249   }
8250 }
8251
8252 /* Emit instruction sequence to compute either the approximate square root
8253    or its approximate reciprocal, depending on the flag RECP, and return
8254    whether the sequence was emitted or not.  */
8255
8256 bool
8257 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
8258 {
8259   machine_mode mode = GET_MODE (dst);
8260
8261   if (GET_MODE_INNER (mode) == HFmode)
8262     {
8263       gcc_assert (!recp);
8264       return false;
8265     }
8266
8267   machine_mode mmsk
8268     = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
8269                        GET_MODE_NUNITS (mode));
8270   if (!recp)
8271     {
8272       if (!(flag_mlow_precision_sqrt
8273             || (aarch64_tune_params.approx_modes->sqrt
8274                 & AARCH64_APPROX_MODE (mode))))
8275         return false;
8276
8277       if (flag_finite_math_only
8278           || flag_trapping_math
8279           || !flag_unsafe_math_optimizations
8280           || optimize_function_for_size_p (cfun))
8281         return false;
8282     }
8283   else
8284     /* Caller assumes we cannot fail.  */
8285     gcc_assert (use_rsqrt_p (mode));
8286
8287
8288   rtx xmsk = gen_reg_rtx (mmsk);
8289   if (!recp)
8290     /* When calculating the approximate square root, compare the
8291        argument with 0.0 and create a mask.  */
8292     emit_insn (gen_rtx_SET (xmsk,
8293                             gen_rtx_NEG (mmsk,
8294                                          gen_rtx_EQ (mmsk, src,
8295                                                      CONST0_RTX (mode)))));
8296
8297   /* Estimate the approximate reciprocal square root.  */
8298   rtx xdst = gen_reg_rtx (mode);
8299   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8300
8301   /* Iterate over the series twice for SF and thrice for DF.  */
8302   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8303
8304   /* Optionally iterate over the series once less for faster performance
8305      while sacrificing the accuracy.  */
8306   if ((recp && flag_mrecip_low_precision_sqrt)
8307       || (!recp && flag_mlow_precision_sqrt))
8308     iterations--;
8309
8310   /* Iterate over the series to calculate the approximate reciprocal square
8311      root.  */
8312   rtx x1 = gen_reg_rtx (mode);
8313   while (iterations--)
8314     {
8315       rtx x2 = gen_reg_rtx (mode);
8316       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8317
8318       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8319
8320       if (iterations > 0)
8321         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8322     }
8323
8324   if (!recp)
8325     {
8326       /* Qualify the approximate reciprocal square root when the argument is
8327          0.0 by squashing the intermediary result to 0.0.  */
8328       rtx xtmp = gen_reg_rtx (mmsk);
8329       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8330                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
8331       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8332
8333       /* Calculate the approximate square root.  */
8334       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8335     }
8336
8337   /* Finalize the approximation.  */
8338   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8339
8340   return true;
8341 }
8342
8343 typedef rtx (*recpe_type) (rtx, rtx);
8344
8345 /* Select reciprocal initial estimate insn depending on machine mode.  */
8346
8347 static recpe_type
8348 get_recpe_type (machine_mode mode)
8349 {
8350   switch (mode)
8351   {
8352     case E_SFmode:   return (gen_aarch64_frecpesf);
8353     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
8354     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
8355     case E_DFmode:   return (gen_aarch64_frecpedf);
8356     case E_V2DFmode: return (gen_aarch64_frecpev2df);
8357     default:         gcc_unreachable ();
8358   }
8359 }
8360
8361 typedef rtx (*recps_type) (rtx, rtx, rtx);
8362
8363 /* Select reciprocal series step insn depending on machine mode.  */
8364
8365 static recps_type
8366 get_recps_type (machine_mode mode)
8367 {
8368   switch (mode)
8369   {
8370     case E_SFmode:   return (gen_aarch64_frecpssf);
8371     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
8372     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
8373     case E_DFmode:   return (gen_aarch64_frecpsdf);
8374     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
8375     default:         gcc_unreachable ();
8376   }
8377 }
8378
8379 /* Emit the instruction sequence to compute the approximation for the division
8380    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
8381
8382 bool
8383 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8384 {
8385   machine_mode mode = GET_MODE (quo);
8386
8387   if (GET_MODE_INNER (mode) == HFmode)
8388     return false;
8389
8390   bool use_approx_division_p = (flag_mlow_precision_div
8391                                 || (aarch64_tune_params.approx_modes->division
8392                                     & AARCH64_APPROX_MODE (mode)));
8393
8394   if (!flag_finite_math_only
8395       || flag_trapping_math
8396       || !flag_unsafe_math_optimizations
8397       || optimize_function_for_size_p (cfun)
8398       || !use_approx_division_p)
8399     return false;
8400
8401   /* Estimate the approximate reciprocal.  */
8402   rtx xrcp = gen_reg_rtx (mode);
8403   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8404
8405   /* Iterate over the series twice for SF and thrice for DF.  */
8406   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8407
8408   /* Optionally iterate over the series once less for faster performance,
8409      while sacrificing the accuracy.  */
8410   if (flag_mlow_precision_div)
8411     iterations--;
8412
8413   /* Iterate over the series to calculate the approximate reciprocal.  */
8414   rtx xtmp = gen_reg_rtx (mode);
8415   while (iterations--)
8416     {
8417       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8418
8419       if (iterations > 0)
8420         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8421     }
8422
8423   if (num != CONST1_RTX (mode))
8424     {
8425       /* As the approximate reciprocal of DEN is already calculated, only
8426          calculate the approximate division when NUM is not 1.0.  */
8427       rtx xnum = force_reg (mode, num);
8428       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8429     }
8430
8431   /* Finalize the approximation.  */
8432   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8433   return true;
8434 }
8435
8436 /* Return the number of instructions that can be issued per cycle.  */
8437 static int
8438 aarch64_sched_issue_rate (void)
8439 {
8440   return aarch64_tune_params.issue_rate;
8441 }
8442
8443 static int
8444 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8445 {
8446   int issue_rate = aarch64_sched_issue_rate ();
8447
8448   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8449 }
8450
8451
8452 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8453    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
8454    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
8455
8456 static int
8457 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8458                                                     int ready_index)
8459 {
8460   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8461 }
8462
8463
8464 /* Vectorizer cost model target hooks.  */
8465
8466 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
8467 static int
8468 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8469                                     tree vectype,
8470                                     int misalign ATTRIBUTE_UNUSED)
8471 {
8472   unsigned elements;
8473   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8474   bool fp = false;
8475
8476   if (vectype != NULL)
8477     fp = FLOAT_TYPE_P (vectype);
8478
8479   switch (type_of_cost)
8480     {
8481       case scalar_stmt:
8482         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8483
8484       case scalar_load:
8485         return costs->scalar_load_cost;
8486
8487       case scalar_store:
8488         return costs->scalar_store_cost;
8489
8490       case vector_stmt:
8491         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8492
8493       case vector_load:
8494         return costs->vec_align_load_cost;
8495
8496       case vector_store:
8497         return costs->vec_store_cost;
8498
8499       case vec_to_scalar:
8500         return costs->vec_to_scalar_cost;
8501
8502       case scalar_to_vec:
8503         return costs->scalar_to_vec_cost;
8504
8505       case unaligned_load:
8506         return costs->vec_unalign_load_cost;
8507
8508       case unaligned_store:
8509         return costs->vec_unalign_store_cost;
8510
8511       case cond_branch_taken:
8512         return costs->cond_taken_branch_cost;
8513
8514       case cond_branch_not_taken:
8515         return costs->cond_not_taken_branch_cost;
8516
8517       case vec_perm:
8518         return costs->vec_permute_cost;
8519
8520       case vec_promote_demote:
8521         return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8522
8523       case vec_construct:
8524         elements = TYPE_VECTOR_SUBPARTS (vectype);
8525         return elements / 2 + 1;
8526
8527       default:
8528         gcc_unreachable ();
8529     }
8530 }
8531
8532 /* Implement targetm.vectorize.add_stmt_cost.  */
8533 static unsigned
8534 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8535                        struct _stmt_vec_info *stmt_info, int misalign,
8536                        enum vect_cost_model_location where)
8537 {
8538   unsigned *cost = (unsigned *) data;
8539   unsigned retval = 0;
8540
8541   if (flag_vect_cost_model)
8542     {
8543       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8544       int stmt_cost =
8545             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8546
8547       /* Statements in an inner loop relative to the loop being
8548          vectorized are weighted more heavily.  The value here is
8549          arbitrary and could potentially be improved with analysis.  */
8550       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8551         count *= 50; /*  FIXME  */
8552
8553       retval = (unsigned) (count * stmt_cost);
8554       cost[where] += retval;
8555     }
8556
8557   return retval;
8558 }
8559
8560 static void initialize_aarch64_code_model (struct gcc_options *);
8561
8562 /* Parse the TO_PARSE string and put the architecture struct that it
8563    selects into RES and the architectural features into ISA_FLAGS.
8564    Return an aarch64_parse_opt_result describing the parse result.
8565    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
8566
8567 static enum aarch64_parse_opt_result
8568 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8569                     unsigned long *isa_flags)
8570 {
8571   char *ext;
8572   const struct processor *arch;
8573   char *str = (char *) alloca (strlen (to_parse) + 1);
8574   size_t len;
8575
8576   strcpy (str, to_parse);
8577
8578   ext = strchr (str, '+');
8579
8580   if (ext != NULL)
8581     len = ext - str;
8582   else
8583     len = strlen (str);
8584
8585   if (len == 0)
8586     return AARCH64_PARSE_MISSING_ARG;
8587
8588
8589   /* Loop through the list of supported ARCHes to find a match.  */
8590   for (arch = all_architectures; arch->name != NULL; arch++)
8591     {
8592       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8593         {
8594           unsigned long isa_temp = arch->flags;
8595
8596           if (ext != NULL)
8597             {
8598               /* TO_PARSE string contains at least one extension.  */
8599               enum aarch64_parse_opt_result ext_res
8600                 = aarch64_parse_extension (ext, &isa_temp);
8601
8602               if (ext_res != AARCH64_PARSE_OK)
8603                 return ext_res;
8604             }
8605           /* Extension parsing was successful.  Confirm the result
8606              arch and ISA flags.  */
8607           *res = arch;
8608           *isa_flags = isa_temp;
8609           return AARCH64_PARSE_OK;
8610         }
8611     }
8612
8613   /* ARCH name not found in list.  */
8614   return AARCH64_PARSE_INVALID_ARG;
8615 }
8616
8617 /* Parse the TO_PARSE string and put the result tuning in RES and the
8618    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
8619    describing the parse result.  If there is an error parsing, RES and
8620    ISA_FLAGS are left unchanged.  */
8621
8622 static enum aarch64_parse_opt_result
8623 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8624                    unsigned long *isa_flags)
8625 {
8626   char *ext;
8627   const struct processor *cpu;
8628   char *str = (char *) alloca (strlen (to_parse) + 1);
8629   size_t len;
8630
8631   strcpy (str, to_parse);
8632
8633   ext = strchr (str, '+');
8634
8635   if (ext != NULL)
8636     len = ext - str;
8637   else
8638     len = strlen (str);
8639
8640   if (len == 0)
8641     return AARCH64_PARSE_MISSING_ARG;
8642
8643
8644   /* Loop through the list of supported CPUs to find a match.  */
8645   for (cpu = all_cores; cpu->name != NULL; cpu++)
8646     {
8647       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8648         {
8649           unsigned long isa_temp = cpu->flags;
8650
8651
8652           if (ext != NULL)
8653             {
8654               /* TO_PARSE string contains at least one extension.  */
8655               enum aarch64_parse_opt_result ext_res
8656                 = aarch64_parse_extension (ext, &isa_temp);
8657
8658               if (ext_res != AARCH64_PARSE_OK)
8659                 return ext_res;
8660             }
8661           /* Extension parsing was successfull.  Confirm the result
8662              cpu and ISA flags.  */
8663           *res = cpu;
8664           *isa_flags = isa_temp;
8665           return AARCH64_PARSE_OK;
8666         }
8667     }
8668
8669   /* CPU name not found in list.  */
8670   return AARCH64_PARSE_INVALID_ARG;
8671 }
8672
8673 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8674    Return an aarch64_parse_opt_result describing the parse result.
8675    If the parsing fails the RES does not change.  */
8676
8677 static enum aarch64_parse_opt_result
8678 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8679 {
8680   const struct processor *cpu;
8681   char *str = (char *) alloca (strlen (to_parse) + 1);
8682
8683   strcpy (str, to_parse);
8684
8685   /* Loop through the list of supported CPUs to find a match.  */
8686   for (cpu = all_cores; cpu->name != NULL; cpu++)
8687     {
8688       if (strcmp (cpu->name, str) == 0)
8689         {
8690           *res = cpu;
8691           return AARCH64_PARSE_OK;
8692         }
8693     }
8694
8695   /* CPU name not found in list.  */
8696   return AARCH64_PARSE_INVALID_ARG;
8697 }
8698
8699 /* Parse TOKEN, which has length LENGTH to see if it is an option
8700    described in FLAG.  If it is, return the index bit for that fusion type.
8701    If not, error (printing OPTION_NAME) and return zero.  */
8702
8703 static unsigned int
8704 aarch64_parse_one_option_token (const char *token,
8705                                 size_t length,
8706                                 const struct aarch64_flag_desc *flag,
8707                                 const char *option_name)
8708 {
8709   for (; flag->name != NULL; flag++)
8710     {
8711       if (length == strlen (flag->name)
8712           && !strncmp (flag->name, token, length))
8713         return flag->flag;
8714     }
8715
8716   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8717   return 0;
8718 }
8719
8720 /* Parse OPTION which is a comma-separated list of flags to enable.
8721    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8722    default state we inherit from the CPU tuning structures.  OPTION_NAME
8723    gives the top-level option we are parsing in the -moverride string,
8724    for use in error messages.  */
8725
8726 static unsigned int
8727 aarch64_parse_boolean_options (const char *option,
8728                                const struct aarch64_flag_desc *flags,
8729                                unsigned int initial_state,
8730                                const char *option_name)
8731 {
8732   const char separator = '.';
8733   const char* specs = option;
8734   const char* ntoken = option;
8735   unsigned int found_flags = initial_state;
8736
8737   while ((ntoken = strchr (specs, separator)))
8738     {
8739       size_t token_length = ntoken - specs;
8740       unsigned token_ops = aarch64_parse_one_option_token (specs,
8741                                                            token_length,
8742                                                            flags,
8743                                                            option_name);
8744       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8745          in the token stream, reset the supported operations.  So:
8746
8747            adrp+add.cmp+branch.none.adrp+add
8748
8749            would have the result of turning on only adrp+add fusion.  */
8750       if (!token_ops)
8751         found_flags = 0;
8752
8753       found_flags |= token_ops;
8754       specs = ++ntoken;
8755     }
8756
8757   /* We ended with a comma, print something.  */
8758   if (!(*specs))
8759     {
8760       error ("%s string ill-formed\n", option_name);
8761       return 0;
8762     }
8763
8764   /* We still have one more token to parse.  */
8765   size_t token_length = strlen (specs);
8766   unsigned token_ops = aarch64_parse_one_option_token (specs,
8767                                                        token_length,
8768                                                        flags,
8769                                                        option_name);
8770    if (!token_ops)
8771      found_flags = 0;
8772
8773   found_flags |= token_ops;
8774   return found_flags;
8775 }
8776
8777 /* Support for overriding instruction fusion.  */
8778
8779 static void
8780 aarch64_parse_fuse_string (const char *fuse_string,
8781                             struct tune_params *tune)
8782 {
8783   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8784                                                      aarch64_fusible_pairs,
8785                                                      tune->fusible_ops,
8786                                                      "fuse=");
8787 }
8788
8789 /* Support for overriding other tuning flags.  */
8790
8791 static void
8792 aarch64_parse_tune_string (const char *tune_string,
8793                             struct tune_params *tune)
8794 {
8795   tune->extra_tuning_flags
8796     = aarch64_parse_boolean_options (tune_string,
8797                                      aarch64_tuning_flags,
8798                                      tune->extra_tuning_flags,
8799                                      "tune=");
8800 }
8801
8802 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8803    we understand.  If it is, extract the option string and handoff to
8804    the appropriate function.  */
8805
8806 void
8807 aarch64_parse_one_override_token (const char* token,
8808                                   size_t length,
8809                                   struct tune_params *tune)
8810 {
8811   const struct aarch64_tuning_override_function *fn
8812     = aarch64_tuning_override_functions;
8813
8814   const char *option_part = strchr (token, '=');
8815   if (!option_part)
8816     {
8817       error ("tuning string missing in option (%s)", token);
8818       return;
8819     }
8820
8821   /* Get the length of the option name.  */
8822   length = option_part - token;
8823   /* Skip the '=' to get to the option string.  */
8824   option_part++;
8825
8826   for (; fn->name != NULL; fn++)
8827     {
8828       if (!strncmp (fn->name, token, length))
8829         {
8830           fn->parse_override (option_part, tune);
8831           return;
8832         }
8833     }
8834
8835   error ("unknown tuning option (%s)",token);
8836   return;
8837 }
8838
8839 /* A checking mechanism for the implementation of the tls size.  */
8840
8841 static void
8842 initialize_aarch64_tls_size (struct gcc_options *opts)
8843 {
8844   if (aarch64_tls_size == 0)
8845     aarch64_tls_size = 24;
8846
8847   switch (opts->x_aarch64_cmodel_var)
8848     {
8849     case AARCH64_CMODEL_TINY:
8850       /* Both the default and maximum TLS size allowed under tiny is 1M which
8851          needs two instructions to address, so we clamp the size to 24.  */
8852       if (aarch64_tls_size > 24)
8853         aarch64_tls_size = 24;
8854       break;
8855     case AARCH64_CMODEL_SMALL:
8856       /* The maximum TLS size allowed under small is 4G.  */
8857       if (aarch64_tls_size > 32)
8858         aarch64_tls_size = 32;
8859       break;
8860     case AARCH64_CMODEL_LARGE:
8861       /* The maximum TLS size allowed under large is 16E.
8862          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8863       if (aarch64_tls_size > 48)
8864         aarch64_tls_size = 48;
8865       break;
8866     default:
8867       gcc_unreachable ();
8868     }
8869
8870   return;
8871 }
8872
8873 /* Parse STRING looking for options in the format:
8874      string     :: option:string
8875      option     :: name=substring
8876      name       :: {a-z}
8877      substring  :: defined by option.  */
8878
8879 static void
8880 aarch64_parse_override_string (const char* input_string,
8881                                struct tune_params* tune)
8882 {
8883   const char separator = ':';
8884   size_t string_length = strlen (input_string) + 1;
8885   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8886   char *string = string_root;
8887   strncpy (string, input_string, string_length);
8888   string[string_length - 1] = '\0';
8889
8890   char* ntoken = string;
8891
8892   while ((ntoken = strchr (string, separator)))
8893     {
8894       size_t token_length = ntoken - string;
8895       /* Make this substring look like a string.  */
8896       *ntoken = '\0';
8897       aarch64_parse_one_override_token (string, token_length, tune);
8898       string = ++ntoken;
8899     }
8900
8901   /* One last option to parse.  */
8902   aarch64_parse_one_override_token (string, strlen (string), tune);
8903   free (string_root);
8904 }
8905
8906
8907 static void
8908 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8909 {
8910   /* The logic here is that if we are disabling all frame pointer generation
8911      then we do not need to disable leaf frame pointer generation as a
8912      separate operation.  But if we are *only* disabling leaf frame pointer
8913      generation then we set flag_omit_frame_pointer to true, but in
8914      aarch64_frame_pointer_required we return false only for leaf functions.
8915
8916      PR 70044: We have to be careful about being called multiple times for the
8917      same function.  Once we have decided to set flag_omit_frame_pointer just
8918      so that we can omit leaf frame pointers, we must then not interpret a
8919      second call as meaning that all frame pointer generation should be
8920      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8921      non-zero value.  */
8922   if (opts->x_flag_omit_frame_pointer == 2)
8923     opts->x_flag_omit_frame_pointer = 0;
8924
8925   if (opts->x_flag_omit_frame_pointer)
8926     opts->x_flag_omit_leaf_frame_pointer = false;
8927   else if (opts->x_flag_omit_leaf_frame_pointer)
8928     opts->x_flag_omit_frame_pointer = 2;
8929
8930   /* If not optimizing for size, set the default
8931      alignment to what the target wants.  */
8932   if (!opts->x_optimize_size)
8933     {
8934       if (opts->x_align_loops <= 0)
8935         opts->x_align_loops = aarch64_tune_params.loop_align;
8936       if (opts->x_align_jumps <= 0)
8937         opts->x_align_jumps = aarch64_tune_params.jump_align;
8938       if (opts->x_align_functions <= 0)
8939         opts->x_align_functions = aarch64_tune_params.function_align;
8940     }
8941
8942   /* We default to no pc-relative literal loads.  */
8943
8944   aarch64_pcrelative_literal_loads = false;
8945
8946   /* If -mpc-relative-literal-loads is set on the command line, this
8947      implies that the user asked for PC relative literal loads.  */
8948   if (opts->x_pcrelative_literal_loads == 1)
8949     aarch64_pcrelative_literal_loads = true;
8950
8951   /* This is PR70113. When building the Linux kernel with
8952      CONFIG_ARM64_ERRATUM_843419, support for relocations
8953      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8954      removed from the kernel to avoid loading objects with possibly
8955      offending sequences.  Without -mpc-relative-literal-loads we would
8956      generate such relocations, preventing the kernel build from
8957      succeeding.  */
8958   if (opts->x_pcrelative_literal_loads == 2
8959       && TARGET_FIX_ERR_A53_843419)
8960     aarch64_pcrelative_literal_loads = true;
8961
8962   /* In the tiny memory model it makes no sense to disallow PC relative
8963      literal pool loads.  */
8964   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8965       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8966     aarch64_pcrelative_literal_loads = true;
8967
8968   /* When enabling the lower precision Newton series for the square root, also
8969      enable it for the reciprocal square root, since the latter is an
8970      intermediary step for the former.  */
8971   if (flag_mlow_precision_sqrt)
8972     flag_mrecip_low_precision_sqrt = true;
8973 }
8974
8975 /* 'Unpack' up the internal tuning structs and update the options
8976     in OPTS.  The caller must have set up selected_tune and selected_arch
8977     as all the other target-specific codegen decisions are
8978     derived from them.  */
8979
8980 void
8981 aarch64_override_options_internal (struct gcc_options *opts)
8982 {
8983   aarch64_tune_flags = selected_tune->flags;
8984   aarch64_tune = selected_tune->sched_core;
8985   /* Make a copy of the tuning parameters attached to the core, which
8986      we may later overwrite.  */
8987   aarch64_tune_params = *(selected_tune->tune);
8988   aarch64_architecture_version = selected_arch->architecture_version;
8989
8990   if (opts->x_aarch64_override_tune_string)
8991     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8992                                   &aarch64_tune_params);
8993
8994   /* This target defaults to strict volatile bitfields.  */
8995   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8996     opts->x_flag_strict_volatile_bitfields = 1;
8997
8998   initialize_aarch64_code_model (opts);
8999   initialize_aarch64_tls_size (opts);
9000
9001   int queue_depth = 0;
9002   switch (aarch64_tune_params.autoprefetcher_model)
9003     {
9004       case tune_params::AUTOPREFETCHER_OFF:
9005         queue_depth = -1;
9006         break;
9007       case tune_params::AUTOPREFETCHER_WEAK:
9008         queue_depth = 0;
9009         break;
9010       case tune_params::AUTOPREFETCHER_STRONG:
9011         queue_depth = max_insn_queue_index + 1;
9012         break;
9013       default:
9014         gcc_unreachable ();
9015     }
9016
9017   /* We don't mind passing in global_options_set here as we don't use
9018      the *options_set structs anyway.  */
9019   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
9020                          queue_depth,
9021                          opts->x_param_values,
9022                          global_options_set.x_param_values);
9023
9024   /* Set up parameters to be used in prefetching algorithm.  Do not
9025      override the defaults unless we are tuning for a core we have
9026      researched values for.  */
9027   if (aarch64_tune_params.prefetch->num_slots > 0)
9028     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
9029                            aarch64_tune_params.prefetch->num_slots,
9030                            opts->x_param_values,
9031                            global_options_set.x_param_values);
9032   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
9033     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
9034                            aarch64_tune_params.prefetch->l1_cache_size,
9035                            opts->x_param_values,
9036                            global_options_set.x_param_values);
9037   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
9038     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
9039                            aarch64_tune_params.prefetch->l1_cache_line_size,
9040                            opts->x_param_values,
9041                            global_options_set.x_param_values);
9042   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
9043     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
9044                            aarch64_tune_params.prefetch->l2_cache_size,
9045                            opts->x_param_values,
9046                            global_options_set.x_param_values);
9047
9048   /* Enable sw prefetching at specified optimization level for
9049      CPUS that have prefetch.  Lower optimization level threshold by 1
9050      when profiling is enabled.  */
9051   if (opts->x_flag_prefetch_loop_arrays < 0
9052       && !opts->x_optimize_size
9053       && aarch64_tune_params.prefetch->default_opt_level >= 0
9054       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
9055     opts->x_flag_prefetch_loop_arrays = 1;
9056
9057   aarch64_override_options_after_change_1 (opts);
9058 }
9059
9060 /* Print a hint with a suggestion for a core or architecture name that
9061    most closely resembles what the user passed in STR.  ARCH is true if
9062    the user is asking for an architecture name.  ARCH is false if the user
9063    is asking for a core name.  */
9064
9065 static void
9066 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
9067 {
9068   auto_vec<const char *> candidates;
9069   const struct processor *entry = arch ? all_architectures : all_cores;
9070   for (; entry->name != NULL; entry++)
9071     candidates.safe_push (entry->name);
9072   char *s;
9073   const char *hint = candidates_list_and_hint (str, s, candidates);
9074   if (hint)
9075     inform (input_location, "valid arguments are: %s;"
9076                              " did you mean %qs?", s, hint);
9077   XDELETEVEC (s);
9078 }
9079
9080 /* Print a hint with a suggestion for a core name that most closely resembles
9081    what the user passed in STR.  */
9082
9083 inline static void
9084 aarch64_print_hint_for_core (const char *str)
9085 {
9086   aarch64_print_hint_for_core_or_arch (str, false);
9087 }
9088
9089 /* Print a hint with a suggestion for an architecture name that most closely
9090    resembles what the user passed in STR.  */
9091
9092 inline static void
9093 aarch64_print_hint_for_arch (const char *str)
9094 {
9095   aarch64_print_hint_for_core_or_arch (str, true);
9096 }
9097
9098 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
9099    specified in STR and throw errors if appropriate.  Put the results if
9100    they are valid in RES and ISA_FLAGS.  Return whether the option is
9101    valid.  */
9102
9103 static bool
9104 aarch64_validate_mcpu (const char *str, const struct processor **res,
9105                        unsigned long *isa_flags)
9106 {
9107   enum aarch64_parse_opt_result parse_res
9108     = aarch64_parse_cpu (str, res, isa_flags);
9109
9110   if (parse_res == AARCH64_PARSE_OK)
9111     return true;
9112
9113   switch (parse_res)
9114     {
9115       case AARCH64_PARSE_MISSING_ARG:
9116         error ("missing cpu name in %<-mcpu=%s%>", str);
9117         break;
9118       case AARCH64_PARSE_INVALID_ARG:
9119         error ("unknown value %qs for -mcpu", str);
9120         aarch64_print_hint_for_core (str);
9121         break;
9122       case AARCH64_PARSE_INVALID_FEATURE:
9123         error ("invalid feature modifier in %<-mcpu=%s%>", str);
9124         break;
9125       default:
9126         gcc_unreachable ();
9127     }
9128
9129   return false;
9130 }
9131
9132 /* Validate a command-line -march option.  Parse the arch and extensions
9133    (if any) specified in STR and throw errors if appropriate.  Put the
9134    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
9135    option is valid.  */
9136
9137 static bool
9138 aarch64_validate_march (const char *str, const struct processor **res,
9139                          unsigned long *isa_flags)
9140 {
9141   enum aarch64_parse_opt_result parse_res
9142     = aarch64_parse_arch (str, res, isa_flags);
9143
9144   if (parse_res == AARCH64_PARSE_OK)
9145     return true;
9146
9147   switch (parse_res)
9148     {
9149       case AARCH64_PARSE_MISSING_ARG:
9150         error ("missing arch name in %<-march=%s%>", str);
9151         break;
9152       case AARCH64_PARSE_INVALID_ARG:
9153         error ("unknown value %qs for -march", str);
9154         aarch64_print_hint_for_arch (str);
9155         break;
9156       case AARCH64_PARSE_INVALID_FEATURE:
9157         error ("invalid feature modifier in %<-march=%s%>", str);
9158         break;
9159       default:
9160         gcc_unreachable ();
9161     }
9162
9163   return false;
9164 }
9165
9166 /* Validate a command-line -mtune option.  Parse the cpu
9167    specified in STR and throw errors if appropriate.  Put the
9168    result, if it is valid, in RES.  Return whether the option is
9169    valid.  */
9170
9171 static bool
9172 aarch64_validate_mtune (const char *str, const struct processor **res)
9173 {
9174   enum aarch64_parse_opt_result parse_res
9175     = aarch64_parse_tune (str, res);
9176
9177   if (parse_res == AARCH64_PARSE_OK)
9178     return true;
9179
9180   switch (parse_res)
9181     {
9182       case AARCH64_PARSE_MISSING_ARG:
9183         error ("missing cpu name in %<-mtune=%s%>", str);
9184         break;
9185       case AARCH64_PARSE_INVALID_ARG:
9186         error ("unknown value %qs for -mtune", str);
9187         aarch64_print_hint_for_core (str);
9188         break;
9189       default:
9190         gcc_unreachable ();
9191     }
9192   return false;
9193 }
9194
9195 /* Return the CPU corresponding to the enum CPU.
9196    If it doesn't specify a cpu, return the default.  */
9197
9198 static const struct processor *
9199 aarch64_get_tune_cpu (enum aarch64_processor cpu)
9200 {
9201   if (cpu != aarch64_none)
9202     return &all_cores[cpu];
9203
9204   /* The & 0x3f is to extract the bottom 6 bits that encode the
9205      default cpu as selected by the --with-cpu GCC configure option
9206      in config.gcc.
9207      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9208      flags mechanism should be reworked to make it more sane.  */
9209   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9210 }
9211
9212 /* Return the architecture corresponding to the enum ARCH.
9213    If it doesn't specify a valid architecture, return the default.  */
9214
9215 static const struct processor *
9216 aarch64_get_arch (enum aarch64_arch arch)
9217 {
9218   if (arch != aarch64_no_arch)
9219     return &all_architectures[arch];
9220
9221   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
9222
9223   return &all_architectures[cpu->arch];
9224 }
9225
9226 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
9227    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9228    tuning structs.  In particular it must set selected_tune and
9229    aarch64_isa_flags that define the available ISA features and tuning
9230    decisions.  It must also set selected_arch as this will be used to
9231    output the .arch asm tags for each function.  */
9232
9233 static void
9234 aarch64_override_options (void)
9235 {
9236   unsigned long cpu_isa = 0;
9237   unsigned long arch_isa = 0;
9238   aarch64_isa_flags = 0;
9239
9240   bool valid_cpu = true;
9241   bool valid_tune = true;
9242   bool valid_arch = true;
9243
9244   selected_cpu = NULL;
9245   selected_arch = NULL;
9246   selected_tune = NULL;
9247
9248   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9249      If either of -march or -mtune is given, they override their
9250      respective component of -mcpu.  */
9251   if (aarch64_cpu_string)
9252     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
9253                                         &cpu_isa);
9254
9255   if (aarch64_arch_string)
9256     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
9257                                           &arch_isa);
9258
9259   if (aarch64_tune_string)
9260     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
9261
9262   /* If the user did not specify a processor, choose the default
9263      one for them.  This will be the CPU set during configuration using
9264      --with-cpu, otherwise it is "generic".  */
9265   if (!selected_cpu)
9266     {
9267       if (selected_arch)
9268         {
9269           selected_cpu = &all_cores[selected_arch->ident];
9270           aarch64_isa_flags = arch_isa;
9271           explicit_arch = selected_arch->arch;
9272         }
9273       else
9274         {
9275           /* Get default configure-time CPU.  */
9276           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
9277           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
9278         }
9279
9280       if (selected_tune)
9281         explicit_tune_core = selected_tune->ident;
9282     }
9283   /* If both -mcpu and -march are specified check that they are architecturally
9284      compatible, warn if they're not and prefer the -march ISA flags.  */
9285   else if (selected_arch)
9286     {
9287       if (selected_arch->arch != selected_cpu->arch)
9288         {
9289           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9290                        all_architectures[selected_cpu->arch].name,
9291                        selected_arch->name);
9292         }
9293       aarch64_isa_flags = arch_isa;
9294       explicit_arch = selected_arch->arch;
9295       explicit_tune_core = selected_tune ? selected_tune->ident
9296                                           : selected_cpu->ident;
9297     }
9298   else
9299     {
9300       /* -mcpu but no -march.  */
9301       aarch64_isa_flags = cpu_isa;
9302       explicit_tune_core = selected_tune ? selected_tune->ident
9303                                           : selected_cpu->ident;
9304       gcc_assert (selected_cpu);
9305       selected_arch = &all_architectures[selected_cpu->arch];
9306       explicit_arch = selected_arch->arch;
9307     }
9308
9309   /* Set the arch as well as we will need it when outputing
9310      the .arch directive in assembly.  */
9311   if (!selected_arch)
9312     {
9313       gcc_assert (selected_cpu);
9314       selected_arch = &all_architectures[selected_cpu->arch];
9315     }
9316
9317   if (!selected_tune)
9318     selected_tune = selected_cpu;
9319
9320 #ifndef HAVE_AS_MABI_OPTION
9321   /* The compiler may have been configured with 2.23.* binutils, which does
9322      not have support for ILP32.  */
9323   if (TARGET_ILP32)
9324     error ("Assembler does not support -mabi=ilp32");
9325 #endif
9326
9327   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9328     sorry ("Return address signing is only supported for -mabi=lp64");
9329
9330   /* Make sure we properly set up the explicit options.  */
9331   if ((aarch64_cpu_string && valid_cpu)
9332        || (aarch64_tune_string && valid_tune))
9333     gcc_assert (explicit_tune_core != aarch64_none);
9334
9335   if ((aarch64_cpu_string && valid_cpu)
9336        || (aarch64_arch_string && valid_arch))
9337     gcc_assert (explicit_arch != aarch64_no_arch);
9338
9339   aarch64_override_options_internal (&global_options);
9340
9341   /* Save these options as the default ones in case we push and pop them later
9342      while processing functions with potential target attributes.  */
9343   target_option_default_node = target_option_current_node
9344       = build_target_option_node (&global_options);
9345 }
9346
9347 /* Implement targetm.override_options_after_change.  */
9348
9349 static void
9350 aarch64_override_options_after_change (void)
9351 {
9352   aarch64_override_options_after_change_1 (&global_options);
9353 }
9354
9355 static struct machine_function *
9356 aarch64_init_machine_status (void)
9357 {
9358   struct machine_function *machine;
9359   machine = ggc_cleared_alloc<machine_function> ();
9360   return machine;
9361 }
9362
9363 void
9364 aarch64_init_expanders (void)
9365 {
9366   init_machine_status = aarch64_init_machine_status;
9367 }
9368
9369 /* A checking mechanism for the implementation of the various code models.  */
9370 static void
9371 initialize_aarch64_code_model (struct gcc_options *opts)
9372 {
9373    if (opts->x_flag_pic)
9374      {
9375        switch (opts->x_aarch64_cmodel_var)
9376          {
9377          case AARCH64_CMODEL_TINY:
9378            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9379            break;
9380          case AARCH64_CMODEL_SMALL:
9381 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9382            aarch64_cmodel = (flag_pic == 2
9383                              ? AARCH64_CMODEL_SMALL_PIC
9384                              : AARCH64_CMODEL_SMALL_SPIC);
9385 #else
9386            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9387 #endif
9388            break;
9389          case AARCH64_CMODEL_LARGE:
9390            sorry ("code model %qs with -f%s", "large",
9391                   opts->x_flag_pic > 1 ? "PIC" : "pic");
9392            break;
9393          default:
9394            gcc_unreachable ();
9395          }
9396      }
9397    else
9398      aarch64_cmodel = opts->x_aarch64_cmodel_var;
9399 }
9400
9401 /* Implement TARGET_OPTION_SAVE.  */
9402
9403 static void
9404 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9405 {
9406   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9407 }
9408
9409 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
9410    using the information saved in PTR.  */
9411
9412 static void
9413 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9414 {
9415   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9416   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9417   opts->x_explicit_arch = ptr->x_explicit_arch;
9418   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9419   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9420
9421   aarch64_override_options_internal (opts);
9422 }
9423
9424 /* Implement TARGET_OPTION_PRINT.  */
9425
9426 static void
9427 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9428 {
9429   const struct processor *cpu
9430     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9431   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9432   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9433   std::string extension
9434     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9435
9436   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9437   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9438            arch->name, extension.c_str ());
9439 }
9440
9441 static GTY(()) tree aarch64_previous_fndecl;
9442
9443 void
9444 aarch64_reset_previous_fndecl (void)
9445 {
9446   aarch64_previous_fndecl = NULL;
9447 }
9448
9449 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9450    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9451    make sure optab availability predicates are recomputed when necessary.  */
9452
9453 void
9454 aarch64_save_restore_target_globals (tree new_tree)
9455 {
9456   if (TREE_TARGET_GLOBALS (new_tree))
9457     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9458   else if (new_tree == target_option_default_node)
9459     restore_target_globals (&default_target_globals);
9460   else
9461     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9462 }
9463
9464 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
9465    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9466    of the function, if such exists.  This function may be called multiple
9467    times on a single function so use aarch64_previous_fndecl to avoid
9468    setting up identical state.  */
9469
9470 static void
9471 aarch64_set_current_function (tree fndecl)
9472 {
9473   if (!fndecl || fndecl == aarch64_previous_fndecl)
9474     return;
9475
9476   tree old_tree = (aarch64_previous_fndecl
9477                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9478                    : NULL_TREE);
9479
9480   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9481
9482   /* If current function has no attributes but the previous one did,
9483      use the default node.  */
9484   if (!new_tree && old_tree)
9485     new_tree = target_option_default_node;
9486
9487   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
9488      the default have been handled by aarch64_save_restore_target_globals from
9489      aarch64_pragma_target_parse.  */
9490   if (old_tree == new_tree)
9491     return;
9492
9493   aarch64_previous_fndecl = fndecl;
9494
9495   /* First set the target options.  */
9496   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9497
9498   aarch64_save_restore_target_globals (new_tree);
9499 }
9500
9501 /* Enum describing the various ways we can handle attributes.
9502    In many cases we can reuse the generic option handling machinery.  */
9503
9504 enum aarch64_attr_opt_type
9505 {
9506   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
9507   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
9508   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
9509   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
9510 };
9511
9512 /* All the information needed to handle a target attribute.
9513    NAME is the name of the attribute.
9514    ATTR_TYPE specifies the type of behavior of the attribute as described
9515    in the definition of enum aarch64_attr_opt_type.
9516    ALLOW_NEG is true if the attribute supports a "no-" form.
9517    HANDLER is the function that takes the attribute string and whether
9518    it is a pragma or attribute and handles the option.  It is needed only
9519    when the ATTR_TYPE is aarch64_attr_custom.
9520    OPT_NUM is the enum specifying the option that the attribute modifies.
9521    This is needed for attributes that mirror the behavior of a command-line
9522    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9523    aarch64_attr_enum.  */
9524
9525 struct aarch64_attribute_info
9526 {
9527   const char *name;
9528   enum aarch64_attr_opt_type attr_type;
9529   bool allow_neg;
9530   bool (*handler) (const char *, const char *);
9531   enum opt_code opt_num;
9532 };
9533
9534 /* Handle the ARCH_STR argument to the arch= target attribute.
9535    PRAGMA_OR_ATTR is used in potential error messages.  */
9536
9537 static bool
9538 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9539 {
9540   const struct processor *tmp_arch = NULL;
9541   enum aarch64_parse_opt_result parse_res
9542     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9543
9544   if (parse_res == AARCH64_PARSE_OK)
9545     {
9546       gcc_assert (tmp_arch);
9547       selected_arch = tmp_arch;
9548       explicit_arch = selected_arch->arch;
9549       return true;
9550     }
9551
9552   switch (parse_res)
9553     {
9554       case AARCH64_PARSE_MISSING_ARG:
9555         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9556         break;
9557       case AARCH64_PARSE_INVALID_ARG:
9558         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9559         aarch64_print_hint_for_arch (str);
9560         break;
9561       case AARCH64_PARSE_INVALID_FEATURE:
9562         error ("invalid feature modifier %qs for 'arch' target %s",
9563                str, pragma_or_attr);
9564         break;
9565       default:
9566         gcc_unreachable ();
9567     }
9568
9569   return false;
9570 }
9571
9572 /* Handle the argument CPU_STR to the cpu= target attribute.
9573    PRAGMA_OR_ATTR is used in potential error messages.  */
9574
9575 static bool
9576 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9577 {
9578   const struct processor *tmp_cpu = NULL;
9579   enum aarch64_parse_opt_result parse_res
9580     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9581
9582   if (parse_res == AARCH64_PARSE_OK)
9583     {
9584       gcc_assert (tmp_cpu);
9585       selected_tune = tmp_cpu;
9586       explicit_tune_core = selected_tune->ident;
9587
9588       selected_arch = &all_architectures[tmp_cpu->arch];
9589       explicit_arch = selected_arch->arch;
9590       return true;
9591     }
9592
9593   switch (parse_res)
9594     {
9595       case AARCH64_PARSE_MISSING_ARG:
9596         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9597         break;
9598       case AARCH64_PARSE_INVALID_ARG:
9599         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9600         aarch64_print_hint_for_core (str);
9601         break;
9602       case AARCH64_PARSE_INVALID_FEATURE:
9603         error ("invalid feature modifier %qs for 'cpu' target %s",
9604                str, pragma_or_attr);
9605         break;
9606       default:
9607         gcc_unreachable ();
9608     }
9609
9610   return false;
9611 }
9612
9613 /* Handle the argument STR to the tune= target attribute.
9614    PRAGMA_OR_ATTR is used in potential error messages.  */
9615
9616 static bool
9617 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9618 {
9619   const struct processor *tmp_tune = NULL;
9620   enum aarch64_parse_opt_result parse_res
9621     = aarch64_parse_tune (str, &tmp_tune);
9622
9623   if (parse_res == AARCH64_PARSE_OK)
9624     {
9625       gcc_assert (tmp_tune);
9626       selected_tune = tmp_tune;
9627       explicit_tune_core = selected_tune->ident;
9628       return true;
9629     }
9630
9631   switch (parse_res)
9632     {
9633       case AARCH64_PARSE_INVALID_ARG:
9634         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9635         aarch64_print_hint_for_core (str);
9636         break;
9637       default:
9638         gcc_unreachable ();
9639     }
9640
9641   return false;
9642 }
9643
9644 /* Parse an architecture extensions target attribute string specified in STR.
9645    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
9646    if successful.  Update aarch64_isa_flags to reflect the ISA features
9647    modified.
9648    PRAGMA_OR_ATTR is used in potential error messages.  */
9649
9650 static bool
9651 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9652 {
9653   enum aarch64_parse_opt_result parse_res;
9654   unsigned long isa_flags = aarch64_isa_flags;
9655
9656   /* We allow "+nothing" in the beginning to clear out all architectural
9657      features if the user wants to handpick specific features.  */
9658   if (strncmp ("+nothing", str, 8) == 0)
9659     {
9660       isa_flags = 0;
9661       str += 8;
9662     }
9663
9664   parse_res = aarch64_parse_extension (str, &isa_flags);
9665
9666   if (parse_res == AARCH64_PARSE_OK)
9667     {
9668       aarch64_isa_flags = isa_flags;
9669       return true;
9670     }
9671
9672   switch (parse_res)
9673     {
9674       case AARCH64_PARSE_MISSING_ARG:
9675         error ("missing feature modifier in target %s %qs",
9676                pragma_or_attr, str);
9677         break;
9678
9679       case AARCH64_PARSE_INVALID_FEATURE:
9680         error ("invalid feature modifier in target %s %qs",
9681                pragma_or_attr, str);
9682         break;
9683
9684       default:
9685         gcc_unreachable ();
9686     }
9687
9688  return false;
9689 }
9690
9691 /* The target attributes that we support.  On top of these we also support just
9692    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
9693    handled explicitly in aarch64_process_one_target_attr.  */
9694
9695 static const struct aarch64_attribute_info aarch64_attributes[] =
9696 {
9697   { "general-regs-only", aarch64_attr_mask, false, NULL,
9698      OPT_mgeneral_regs_only },
9699   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9700      OPT_mfix_cortex_a53_835769 },
9701   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9702      OPT_mfix_cortex_a53_843419 },
9703   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9704   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9705   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9706      OPT_momit_leaf_frame_pointer },
9707   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9708   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9709      OPT_march_ },
9710   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9711   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9712      OPT_mtune_ },
9713   { "sign-return-address", aarch64_attr_enum, false, NULL,
9714      OPT_msign_return_address_ },
9715   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9716 };
9717
9718 /* Parse ARG_STR which contains the definition of one target attribute.
9719    Show appropriate errors if any or return true if the attribute is valid.
9720    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9721    we're processing a target attribute or pragma.  */
9722
9723 static bool
9724 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9725 {
9726   bool invert = false;
9727
9728   size_t len = strlen (arg_str);
9729
9730   if (len == 0)
9731     {
9732       error ("malformed target %s", pragma_or_attr);
9733       return false;
9734     }
9735
9736   char *str_to_check = (char *) alloca (len + 1);
9737   strcpy (str_to_check, arg_str);
9738
9739   /* Skip leading whitespace.  */
9740   while (*str_to_check == ' ' || *str_to_check == '\t')
9741     str_to_check++;
9742
9743   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9744      It is easier to detect and handle it explicitly here rather than going
9745      through the machinery for the rest of the target attributes in this
9746      function.  */
9747   if (*str_to_check == '+')
9748     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9749
9750   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9751     {
9752       invert = true;
9753       str_to_check += 3;
9754     }
9755   char *arg = strchr (str_to_check, '=');
9756
9757   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9758      and point ARG to "foo".  */
9759   if (arg)
9760     {
9761       *arg = '\0';
9762       arg++;
9763     }
9764   const struct aarch64_attribute_info *p_attr;
9765   bool found = false;
9766   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9767     {
9768       /* If the names don't match up, or the user has given an argument
9769          to an attribute that doesn't accept one, or didn't give an argument
9770          to an attribute that expects one, fail to match.  */
9771       if (strcmp (str_to_check, p_attr->name) != 0)
9772         continue;
9773
9774       found = true;
9775       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9776                               || p_attr->attr_type == aarch64_attr_enum;
9777
9778       if (attr_need_arg_p ^ (arg != NULL))
9779         {
9780           error ("target %s %qs does not accept an argument",
9781                   pragma_or_attr, str_to_check);
9782           return false;
9783         }
9784
9785       /* If the name matches but the attribute does not allow "no-" versions
9786          then we can't match.  */
9787       if (invert && !p_attr->allow_neg)
9788         {
9789           error ("target %s %qs does not allow a negated form",
9790                   pragma_or_attr, str_to_check);
9791           return false;
9792         }
9793
9794       switch (p_attr->attr_type)
9795         {
9796         /* Has a custom handler registered.
9797            For example, cpu=, arch=, tune=.  */
9798           case aarch64_attr_custom:
9799             gcc_assert (p_attr->handler);
9800             if (!p_attr->handler (arg, pragma_or_attr))
9801               return false;
9802             break;
9803
9804           /* Either set or unset a boolean option.  */
9805           case aarch64_attr_bool:
9806             {
9807               struct cl_decoded_option decoded;
9808
9809               generate_option (p_attr->opt_num, NULL, !invert,
9810                                CL_TARGET, &decoded);
9811               aarch64_handle_option (&global_options, &global_options_set,
9812                                       &decoded, input_location);
9813               break;
9814             }
9815           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9816              should know what mask to apply given the option number.  */
9817           case aarch64_attr_mask:
9818             {
9819               struct cl_decoded_option decoded;
9820               /* We only need to specify the option number.
9821                  aarch64_handle_option will know which mask to apply.  */
9822               decoded.opt_index = p_attr->opt_num;
9823               decoded.value = !invert;
9824               aarch64_handle_option (&global_options, &global_options_set,
9825                                       &decoded, input_location);
9826               break;
9827             }
9828           /* Use the option setting machinery to set an option to an enum.  */
9829           case aarch64_attr_enum:
9830             {
9831               gcc_assert (arg);
9832               bool valid;
9833               int value;
9834               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9835                                               &value, CL_TARGET);
9836               if (valid)
9837                 {
9838                   set_option (&global_options, NULL, p_attr->opt_num, value,
9839                               NULL, DK_UNSPECIFIED, input_location,
9840                               global_dc);
9841                 }
9842               else
9843                 {
9844                   error ("target %s %s=%s is not valid",
9845                          pragma_or_attr, str_to_check, arg);
9846                 }
9847               break;
9848             }
9849           default:
9850             gcc_unreachable ();
9851         }
9852     }
9853
9854   /* If we reached here we either have found an attribute and validated
9855      it or didn't match any.  If we matched an attribute but its arguments
9856      were malformed we will have returned false already.  */
9857   return found;
9858 }
9859
9860 /* Count how many times the character C appears in
9861    NULL-terminated string STR.  */
9862
9863 static unsigned int
9864 num_occurences_in_str (char c, char *str)
9865 {
9866   unsigned int res = 0;
9867   while (*str != '\0')
9868     {
9869       if (*str == c)
9870         res++;
9871
9872       str++;
9873     }
9874
9875   return res;
9876 }
9877
9878 /* Parse the tree in ARGS that contains the target attribute information
9879    and update the global target options space.  PRAGMA_OR_ATTR is a string
9880    to be used in error messages, specifying whether this is processing
9881    a target attribute or a target pragma.  */
9882
9883 bool
9884 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9885 {
9886   if (TREE_CODE (args) == TREE_LIST)
9887     {
9888       do
9889         {
9890           tree head = TREE_VALUE (args);
9891           if (head)
9892             {
9893               if (!aarch64_process_target_attr (head, pragma_or_attr))
9894                 return false;
9895             }
9896           args = TREE_CHAIN (args);
9897         } while (args);
9898
9899       return true;
9900     }
9901
9902   if (TREE_CODE (args) != STRING_CST)
9903     {
9904       error ("attribute %<target%> argument not a string");
9905       return false;
9906     }
9907
9908   size_t len = strlen (TREE_STRING_POINTER (args));
9909   char *str_to_check = (char *) alloca (len + 1);
9910   strcpy (str_to_check, TREE_STRING_POINTER (args));
9911
9912   if (len == 0)
9913     {
9914       error ("malformed target %s value", pragma_or_attr);
9915       return false;
9916     }
9917
9918   /* Used to catch empty spaces between commas i.e.
9919      attribute ((target ("attr1,,attr2"))).  */
9920   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9921
9922   /* Handle multiple target attributes separated by ','.  */
9923   char *token = strtok (str_to_check, ",");
9924
9925   unsigned int num_attrs = 0;
9926   while (token)
9927     {
9928       num_attrs++;
9929       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9930         {
9931           error ("target %s %qs is invalid", pragma_or_attr, token);
9932           return false;
9933         }
9934
9935       token = strtok (NULL, ",");
9936     }
9937
9938   if (num_attrs != num_commas + 1)
9939     {
9940       error ("malformed target %s list %qs",
9941               pragma_or_attr, TREE_STRING_POINTER (args));
9942       return false;
9943     }
9944
9945   return true;
9946 }
9947
9948 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9949    process attribute ((target ("..."))).  */
9950
9951 static bool
9952 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9953 {
9954   struct cl_target_option cur_target;
9955   bool ret;
9956   tree old_optimize;
9957   tree new_target, new_optimize;
9958   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9959
9960   /* If what we're processing is the current pragma string then the
9961      target option node is already stored in target_option_current_node
9962      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9963      having to re-parse the string.  This is especially useful to keep
9964      arm_neon.h compile times down since that header contains a lot
9965      of intrinsics enclosed in pragmas.  */
9966   if (!existing_target && args == current_target_pragma)
9967     {
9968       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9969       return true;
9970     }
9971   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9972
9973   old_optimize = build_optimization_node (&global_options);
9974   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9975
9976   /* If the function changed the optimization levels as well as setting
9977      target options, start with the optimizations specified.  */
9978   if (func_optimize && func_optimize != old_optimize)
9979     cl_optimization_restore (&global_options,
9980                              TREE_OPTIMIZATION (func_optimize));
9981
9982   /* Save the current target options to restore at the end.  */
9983   cl_target_option_save (&cur_target, &global_options);
9984
9985   /* If fndecl already has some target attributes applied to it, unpack
9986      them so that we add this attribute on top of them, rather than
9987      overwriting them.  */
9988   if (existing_target)
9989     {
9990       struct cl_target_option *existing_options
9991         = TREE_TARGET_OPTION (existing_target);
9992
9993       if (existing_options)
9994         cl_target_option_restore (&global_options, existing_options);
9995     }
9996   else
9997     cl_target_option_restore (&global_options,
9998                         TREE_TARGET_OPTION (target_option_current_node));
9999
10000
10001   ret = aarch64_process_target_attr (args, "attribute");
10002
10003   /* Set up any additional state.  */
10004   if (ret)
10005     {
10006       aarch64_override_options_internal (&global_options);
10007       /* Initialize SIMD builtins if we haven't already.
10008          Set current_target_pragma to NULL for the duration so that
10009          the builtin initialization code doesn't try to tag the functions
10010          being built with the attributes specified by any current pragma, thus
10011          going into an infinite recursion.  */
10012       if (TARGET_SIMD)
10013         {
10014           tree saved_current_target_pragma = current_target_pragma;
10015           current_target_pragma = NULL;
10016           aarch64_init_simd_builtins ();
10017           current_target_pragma = saved_current_target_pragma;
10018         }
10019       new_target = build_target_option_node (&global_options);
10020     }
10021   else
10022     new_target = NULL;
10023
10024   new_optimize = build_optimization_node (&global_options);
10025
10026   if (fndecl && ret)
10027     {
10028       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
10029
10030       if (old_optimize != new_optimize)
10031         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
10032     }
10033
10034   cl_target_option_restore (&global_options, &cur_target);
10035
10036   if (old_optimize != new_optimize)
10037     cl_optimization_restore (&global_options,
10038                              TREE_OPTIMIZATION (old_optimize));
10039   return ret;
10040 }
10041
10042 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
10043    tri-bool options (yes, no, don't care) and the default value is
10044    DEF, determine whether to reject inlining.  */
10045
10046 static bool
10047 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
10048                                      int dont_care, int def)
10049 {
10050   /* If the callee doesn't care, always allow inlining.  */
10051   if (callee == dont_care)
10052     return true;
10053
10054   /* If the caller doesn't care, always allow inlining.  */
10055   if (caller == dont_care)
10056     return true;
10057
10058   /* Otherwise, allow inlining if either the callee and caller values
10059      agree, or if the callee is using the default value.  */
10060   return (callee == caller || callee == def);
10061 }
10062
10063 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
10064    to inline CALLEE into CALLER based on target-specific info.
10065    Make sure that the caller and callee have compatible architectural
10066    features.  Then go through the other possible target attributes
10067    and see if they can block inlining.  Try not to reject always_inline
10068    callees unless they are incompatible architecturally.  */
10069
10070 static bool
10071 aarch64_can_inline_p (tree caller, tree callee)
10072 {
10073   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
10074   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
10075
10076   /* If callee has no option attributes, then it is ok to inline.  */
10077   if (!callee_tree)
10078     return true;
10079
10080   struct cl_target_option *caller_opts
10081         = TREE_TARGET_OPTION (caller_tree ? caller_tree
10082                                            : target_option_default_node);
10083
10084   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
10085
10086
10087   /* Callee's ISA flags should be a subset of the caller's.  */
10088   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
10089        != callee_opts->x_aarch64_isa_flags)
10090     return false;
10091
10092   /* Allow non-strict aligned functions inlining into strict
10093      aligned ones.  */
10094   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
10095        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
10096       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
10097            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
10098     return false;
10099
10100   bool always_inline = lookup_attribute ("always_inline",
10101                                           DECL_ATTRIBUTES (callee));
10102
10103   /* If the architectural features match up and the callee is always_inline
10104      then the other attributes don't matter.  */
10105   if (always_inline)
10106     return true;
10107
10108   if (caller_opts->x_aarch64_cmodel_var
10109       != callee_opts->x_aarch64_cmodel_var)
10110     return false;
10111
10112   if (caller_opts->x_aarch64_tls_dialect
10113       != callee_opts->x_aarch64_tls_dialect)
10114     return false;
10115
10116   /* Honour explicit requests to workaround errata.  */
10117   if (!aarch64_tribools_ok_for_inlining_p (
10118           caller_opts->x_aarch64_fix_a53_err835769,
10119           callee_opts->x_aarch64_fix_a53_err835769,
10120           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
10121     return false;
10122
10123   if (!aarch64_tribools_ok_for_inlining_p (
10124           caller_opts->x_aarch64_fix_a53_err843419,
10125           callee_opts->x_aarch64_fix_a53_err843419,
10126           2, TARGET_FIX_ERR_A53_843419))
10127     return false;
10128
10129   /* If the user explicitly specified -momit-leaf-frame-pointer for the
10130      caller and calle and they don't match up, reject inlining.  */
10131   if (!aarch64_tribools_ok_for_inlining_p (
10132           caller_opts->x_flag_omit_leaf_frame_pointer,
10133           callee_opts->x_flag_omit_leaf_frame_pointer,
10134           2, 1))
10135     return false;
10136
10137   /* If the callee has specific tuning overrides, respect them.  */
10138   if (callee_opts->x_aarch64_override_tune_string != NULL
10139       && caller_opts->x_aarch64_override_tune_string == NULL)
10140     return false;
10141
10142   /* If the user specified tuning override strings for the
10143      caller and callee and they don't match up, reject inlining.
10144      We just do a string compare here, we don't analyze the meaning
10145      of the string, as it would be too costly for little gain.  */
10146   if (callee_opts->x_aarch64_override_tune_string
10147       && caller_opts->x_aarch64_override_tune_string
10148       && (strcmp (callee_opts->x_aarch64_override_tune_string,
10149                   caller_opts->x_aarch64_override_tune_string) != 0))
10150     return false;
10151
10152   return true;
10153 }
10154
10155 /* Return true if SYMBOL_REF X binds locally.  */
10156
10157 static bool
10158 aarch64_symbol_binds_local_p (const_rtx x)
10159 {
10160   return (SYMBOL_REF_DECL (x)
10161           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
10162           : SYMBOL_REF_LOCAL_P (x));
10163 }
10164
10165 /* Return true if SYMBOL_REF X is thread local */
10166 static bool
10167 aarch64_tls_symbol_p (rtx x)
10168 {
10169   if (! TARGET_HAVE_TLS)
10170     return false;
10171
10172   if (GET_CODE (x) != SYMBOL_REF)
10173     return false;
10174
10175   return SYMBOL_REF_TLS_MODEL (x) != 0;
10176 }
10177
10178 /* Classify a TLS symbol into one of the TLS kinds.  */
10179 enum aarch64_symbol_type
10180 aarch64_classify_tls_symbol (rtx x)
10181 {
10182   enum tls_model tls_kind = tls_symbolic_operand_type (x);
10183
10184   switch (tls_kind)
10185     {
10186     case TLS_MODEL_GLOBAL_DYNAMIC:
10187     case TLS_MODEL_LOCAL_DYNAMIC:
10188       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
10189
10190     case TLS_MODEL_INITIAL_EXEC:
10191       switch (aarch64_cmodel)
10192         {
10193         case AARCH64_CMODEL_TINY:
10194         case AARCH64_CMODEL_TINY_PIC:
10195           return SYMBOL_TINY_TLSIE;
10196         default:
10197           return SYMBOL_SMALL_TLSIE;
10198         }
10199
10200     case TLS_MODEL_LOCAL_EXEC:
10201       if (aarch64_tls_size == 12)
10202         return SYMBOL_TLSLE12;
10203       else if (aarch64_tls_size == 24)
10204         return SYMBOL_TLSLE24;
10205       else if (aarch64_tls_size == 32)
10206         return SYMBOL_TLSLE32;
10207       else if (aarch64_tls_size == 48)
10208         return SYMBOL_TLSLE48;
10209       else
10210         gcc_unreachable ();
10211
10212     case TLS_MODEL_EMULATED:
10213     case TLS_MODEL_NONE:
10214       return SYMBOL_FORCE_TO_MEM;
10215
10216     default:
10217       gcc_unreachable ();
10218     }
10219 }
10220
10221 /* Return the method that should be used to access SYMBOL_REF or
10222    LABEL_REF X.  */
10223
10224 enum aarch64_symbol_type
10225 aarch64_classify_symbol (rtx x, rtx offset)
10226 {
10227   if (GET_CODE (x) == LABEL_REF)
10228     {
10229       switch (aarch64_cmodel)
10230         {
10231         case AARCH64_CMODEL_LARGE:
10232           return SYMBOL_FORCE_TO_MEM;
10233
10234         case AARCH64_CMODEL_TINY_PIC:
10235         case AARCH64_CMODEL_TINY:
10236           return SYMBOL_TINY_ABSOLUTE;
10237
10238         case AARCH64_CMODEL_SMALL_SPIC:
10239         case AARCH64_CMODEL_SMALL_PIC:
10240         case AARCH64_CMODEL_SMALL:
10241           return SYMBOL_SMALL_ABSOLUTE;
10242
10243         default:
10244           gcc_unreachable ();
10245         }
10246     }
10247
10248   if (GET_CODE (x) == SYMBOL_REF)
10249     {
10250       if (aarch64_tls_symbol_p (x))
10251         return aarch64_classify_tls_symbol (x);
10252
10253       switch (aarch64_cmodel)
10254         {
10255         case AARCH64_CMODEL_TINY:
10256           /* When we retrieve symbol + offset address, we have to make sure
10257              the offset does not cause overflow of the final address.  But
10258              we have no way of knowing the address of symbol at compile time
10259              so we can't accurately say if the distance between the PC and
10260              symbol + offset is outside the addressible range of +/-1M in the
10261              TINY code model.  So we rely on images not being greater than
10262              1M and cap the offset at 1M and anything beyond 1M will have to
10263              be loaded using an alternative mechanism.  Furthermore if the
10264              symbol is a weak reference to something that isn't known to
10265              resolve to a symbol in this module, then force to memory.  */
10266           if ((SYMBOL_REF_WEAK (x)
10267                && !aarch64_symbol_binds_local_p (x))
10268               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
10269             return SYMBOL_FORCE_TO_MEM;
10270           return SYMBOL_TINY_ABSOLUTE;
10271
10272         case AARCH64_CMODEL_SMALL:
10273           /* Same reasoning as the tiny code model, but the offset cap here is
10274              4G.  */
10275           if ((SYMBOL_REF_WEAK (x)
10276                && !aarch64_symbol_binds_local_p (x))
10277               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
10278                             HOST_WIDE_INT_C (4294967264)))
10279             return SYMBOL_FORCE_TO_MEM;
10280           return SYMBOL_SMALL_ABSOLUTE;
10281
10282         case AARCH64_CMODEL_TINY_PIC:
10283           if (!aarch64_symbol_binds_local_p (x))
10284             return SYMBOL_TINY_GOT;
10285           return SYMBOL_TINY_ABSOLUTE;
10286
10287         case AARCH64_CMODEL_SMALL_SPIC:
10288         case AARCH64_CMODEL_SMALL_PIC:
10289           if (!aarch64_symbol_binds_local_p (x))
10290             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
10291                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
10292           return SYMBOL_SMALL_ABSOLUTE;
10293
10294         case AARCH64_CMODEL_LARGE:
10295           /* This is alright even in PIC code as the constant
10296              pool reference is always PC relative and within
10297              the same translation unit.  */
10298           if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
10299             return SYMBOL_SMALL_ABSOLUTE;
10300           else
10301             return SYMBOL_FORCE_TO_MEM;
10302
10303         default:
10304           gcc_unreachable ();
10305         }
10306     }
10307
10308   /* By default push everything into the constant pool.  */
10309   return SYMBOL_FORCE_TO_MEM;
10310 }
10311
10312 bool
10313 aarch64_constant_address_p (rtx x)
10314 {
10315   return (CONSTANT_P (x) && memory_address_p (DImode, x));
10316 }
10317
10318 bool
10319 aarch64_legitimate_pic_operand_p (rtx x)
10320 {
10321   if (GET_CODE (x) == SYMBOL_REF
10322       || (GET_CODE (x) == CONST
10323           && GET_CODE (XEXP (x, 0)) == PLUS
10324           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10325      return false;
10326
10327   return true;
10328 }
10329
10330 /* Return true if X holds either a quarter-precision or
10331      floating-point +0.0 constant.  */
10332 static bool
10333 aarch64_valid_floating_const (rtx x)
10334 {
10335   if (!CONST_DOUBLE_P (x))
10336     return false;
10337
10338   /* This call determines which constants can be used in mov<mode>
10339      as integer moves instead of constant loads.  */
10340   if (aarch64_float_const_rtx_p (x))
10341     return true;
10342
10343   return aarch64_float_const_representable_p (x);
10344 }
10345
10346 static bool
10347 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10348 {
10349   /* Do not allow vector struct mode constants.  We could support
10350      0 and -1 easily, but they need support in aarch64-simd.md.  */
10351   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10352     return false;
10353
10354   /* For these cases we never want to use a literal load.
10355      As such we have to prevent the compiler from forcing these
10356      to memory.  */
10357   if ((GET_CODE (x) == CONST_VECTOR
10358        && aarch64_simd_valid_immediate (x, mode, false, NULL))
10359       || CONST_INT_P (x)
10360       || aarch64_valid_floating_const (x)
10361       || aarch64_can_const_movi_rtx_p (x, mode)
10362       || aarch64_float_const_rtx_p (x))
10363         return !targetm.cannot_force_const_mem (mode, x);
10364
10365   if (GET_CODE (x) == HIGH
10366       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10367     return true;
10368
10369   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
10370      so spilling them is better than rematerialization.  */
10371   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
10372     return true;
10373
10374   return aarch64_constant_address_p (x);
10375 }
10376
10377 rtx
10378 aarch64_load_tp (rtx target)
10379 {
10380   if (!target
10381       || GET_MODE (target) != Pmode
10382       || !register_operand (target, Pmode))
10383     target = gen_reg_rtx (Pmode);
10384
10385   /* Can return in any reg.  */
10386   emit_insn (gen_aarch64_load_tp_hard (target));
10387   return target;
10388 }
10389
10390 /* On AAPCS systems, this is the "struct __va_list".  */
10391 static GTY(()) tree va_list_type;
10392
10393 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10394    Return the type to use as __builtin_va_list.
10395
10396    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10397
10398    struct __va_list
10399    {
10400      void *__stack;
10401      void *__gr_top;
10402      void *__vr_top;
10403      int   __gr_offs;
10404      int   __vr_offs;
10405    };  */
10406
10407 static tree
10408 aarch64_build_builtin_va_list (void)
10409 {
10410   tree va_list_name;
10411   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10412
10413   /* Create the type.  */
10414   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10415   /* Give it the required name.  */
10416   va_list_name = build_decl (BUILTINS_LOCATION,
10417                              TYPE_DECL,
10418                              get_identifier ("__va_list"),
10419                              va_list_type);
10420   DECL_ARTIFICIAL (va_list_name) = 1;
10421   TYPE_NAME (va_list_type) = va_list_name;
10422   TYPE_STUB_DECL (va_list_type) = va_list_name;
10423
10424   /* Create the fields.  */
10425   f_stack = build_decl (BUILTINS_LOCATION,
10426                         FIELD_DECL, get_identifier ("__stack"),
10427                         ptr_type_node);
10428   f_grtop = build_decl (BUILTINS_LOCATION,
10429                         FIELD_DECL, get_identifier ("__gr_top"),
10430                         ptr_type_node);
10431   f_vrtop = build_decl (BUILTINS_LOCATION,
10432                         FIELD_DECL, get_identifier ("__vr_top"),
10433                         ptr_type_node);
10434   f_groff = build_decl (BUILTINS_LOCATION,
10435                         FIELD_DECL, get_identifier ("__gr_offs"),
10436                         integer_type_node);
10437   f_vroff = build_decl (BUILTINS_LOCATION,
10438                         FIELD_DECL, get_identifier ("__vr_offs"),
10439                         integer_type_node);
10440
10441   /* Tell tree-stdarg pass about our internal offset fields.
10442      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10443      purpose to identify whether the code is updating va_list internal
10444      offset fields through irregular way.  */
10445   va_list_gpr_counter_field = f_groff;
10446   va_list_fpr_counter_field = f_vroff;
10447
10448   DECL_ARTIFICIAL (f_stack) = 1;
10449   DECL_ARTIFICIAL (f_grtop) = 1;
10450   DECL_ARTIFICIAL (f_vrtop) = 1;
10451   DECL_ARTIFICIAL (f_groff) = 1;
10452   DECL_ARTIFICIAL (f_vroff) = 1;
10453
10454   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10455   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10456   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10457   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10458   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10459
10460   TYPE_FIELDS (va_list_type) = f_stack;
10461   DECL_CHAIN (f_stack) = f_grtop;
10462   DECL_CHAIN (f_grtop) = f_vrtop;
10463   DECL_CHAIN (f_vrtop) = f_groff;
10464   DECL_CHAIN (f_groff) = f_vroff;
10465
10466   /* Compute its layout.  */
10467   layout_type (va_list_type);
10468
10469   return va_list_type;
10470 }
10471
10472 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
10473 static void
10474 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10475 {
10476   const CUMULATIVE_ARGS *cum;
10477   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10478   tree stack, grtop, vrtop, groff, vroff;
10479   tree t;
10480   int gr_save_area_size = cfun->va_list_gpr_size;
10481   int vr_save_area_size = cfun->va_list_fpr_size;
10482   int vr_offset;
10483
10484   cum = &crtl->args.info;
10485   if (cfun->va_list_gpr_size)
10486     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10487                              cfun->va_list_gpr_size);
10488   if (cfun->va_list_fpr_size)
10489     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10490                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
10491
10492   if (!TARGET_FLOAT)
10493     {
10494       gcc_assert (cum->aapcs_nvrn == 0);
10495       vr_save_area_size = 0;
10496     }
10497
10498   f_stack = TYPE_FIELDS (va_list_type_node);
10499   f_grtop = DECL_CHAIN (f_stack);
10500   f_vrtop = DECL_CHAIN (f_grtop);
10501   f_groff = DECL_CHAIN (f_vrtop);
10502   f_vroff = DECL_CHAIN (f_groff);
10503
10504   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10505                   NULL_TREE);
10506   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10507                   NULL_TREE);
10508   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10509                   NULL_TREE);
10510   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10511                   NULL_TREE);
10512   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10513                   NULL_TREE);
10514
10515   /* Emit code to initialize STACK, which points to the next varargs stack
10516      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
10517      by named arguments.  STACK is 8-byte aligned.  */
10518   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10519   if (cum->aapcs_stack_size > 0)
10520     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10521   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10522   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10523
10524   /* Emit code to initialize GRTOP, the top of the GR save area.
10525      virtual_incoming_args_rtx should have been 16 byte aligned.  */
10526   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10527   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10528   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10529
10530   /* Emit code to initialize VRTOP, the top of the VR save area.
10531      This address is gr_save_area_bytes below GRTOP, rounded
10532      down to the next 16-byte boundary.  */
10533   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10534   vr_offset = ROUND_UP (gr_save_area_size,
10535                         STACK_BOUNDARY / BITS_PER_UNIT);
10536
10537   if (vr_offset)
10538     t = fold_build_pointer_plus_hwi (t, -vr_offset);
10539   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10540   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10541
10542   /* Emit code to initialize GROFF, the offset from GRTOP of the
10543      next GPR argument.  */
10544   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10545               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10546   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10547
10548   /* Likewise emit code to initialize VROFF, the offset from FTOP
10549      of the next VR argument.  */
10550   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10551               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10552   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10553 }
10554
10555 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
10556
10557 static tree
10558 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10559                               gimple_seq *post_p ATTRIBUTE_UNUSED)
10560 {
10561   tree addr;
10562   bool indirect_p;
10563   bool is_ha;           /* is HFA or HVA.  */
10564   bool dw_align;        /* double-word align.  */
10565   machine_mode ag_mode = VOIDmode;
10566   int nregs;
10567   machine_mode mode;
10568
10569   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10570   tree stack, f_top, f_off, off, arg, roundup, on_stack;
10571   HOST_WIDE_INT size, rsize, adjust, align;
10572   tree t, u, cond1, cond2;
10573
10574   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10575   if (indirect_p)
10576     type = build_pointer_type (type);
10577
10578   mode = TYPE_MODE (type);
10579
10580   f_stack = TYPE_FIELDS (va_list_type_node);
10581   f_grtop = DECL_CHAIN (f_stack);
10582   f_vrtop = DECL_CHAIN (f_grtop);
10583   f_groff = DECL_CHAIN (f_vrtop);
10584   f_vroff = DECL_CHAIN (f_groff);
10585
10586   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10587                   f_stack, NULL_TREE);
10588   size = int_size_in_bytes (type);
10589   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10590
10591   dw_align = false;
10592   adjust = 0;
10593   if (aarch64_vfp_is_call_or_return_candidate (mode,
10594                                                type,
10595                                                &ag_mode,
10596                                                &nregs,
10597                                                &is_ha))
10598     {
10599       /* TYPE passed in fp/simd registers.  */
10600       if (!TARGET_FLOAT)
10601         aarch64_err_no_fpadvsimd (mode, "varargs");
10602
10603       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10604                       unshare_expr (valist), f_vrtop, NULL_TREE);
10605       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10606                       unshare_expr (valist), f_vroff, NULL_TREE);
10607
10608       rsize = nregs * UNITS_PER_VREG;
10609
10610       if (is_ha)
10611         {
10612           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10613             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10614         }
10615       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10616                && size < UNITS_PER_VREG)
10617         {
10618           adjust = UNITS_PER_VREG - size;
10619         }
10620     }
10621   else
10622     {
10623       /* TYPE passed in general registers.  */
10624       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10625                       unshare_expr (valist), f_grtop, NULL_TREE);
10626       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10627                       unshare_expr (valist), f_groff, NULL_TREE);
10628       rsize = ROUND_UP (size, UNITS_PER_WORD);
10629       nregs = rsize / UNITS_PER_WORD;
10630
10631       if (align > 8)
10632         dw_align = true;
10633
10634       if (BLOCK_REG_PADDING (mode, type, 1) == downward
10635           && size < UNITS_PER_WORD)
10636         {
10637           adjust = UNITS_PER_WORD  - size;
10638         }
10639     }
10640
10641   /* Get a local temporary for the field value.  */
10642   off = get_initialized_tmp_var (f_off, pre_p, NULL);
10643
10644   /* Emit code to branch if off >= 0.  */
10645   t = build2 (GE_EXPR, boolean_type_node, off,
10646               build_int_cst (TREE_TYPE (off), 0));
10647   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10648
10649   if (dw_align)
10650     {
10651       /* Emit: offs = (offs + 15) & -16.  */
10652       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10653                   build_int_cst (TREE_TYPE (off), 15));
10654       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10655                   build_int_cst (TREE_TYPE (off), -16));
10656       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10657     }
10658   else
10659     roundup = NULL;
10660
10661   /* Update ap.__[g|v]r_offs  */
10662   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10663               build_int_cst (TREE_TYPE (off), rsize));
10664   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10665
10666   /* String up.  */
10667   if (roundup)
10668     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10669
10670   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
10671   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10672               build_int_cst (TREE_TYPE (f_off), 0));
10673   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10674
10675   /* String up: make sure the assignment happens before the use.  */
10676   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10677   COND_EXPR_ELSE (cond1) = t;
10678
10679   /* Prepare the trees handling the argument that is passed on the stack;
10680      the top level node will store in ON_STACK.  */
10681   arg = get_initialized_tmp_var (stack, pre_p, NULL);
10682   if (align > 8)
10683     {
10684       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
10685       t = fold_convert (intDI_type_node, arg);
10686       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10687                   build_int_cst (TREE_TYPE (t), 15));
10688       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10689                   build_int_cst (TREE_TYPE (t), -16));
10690       t = fold_convert (TREE_TYPE (arg), t);
10691       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10692     }
10693   else
10694     roundup = NULL;
10695   /* Advance ap.__stack  */
10696   t = fold_convert (intDI_type_node, arg);
10697   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10698               build_int_cst (TREE_TYPE (t), size + 7));
10699   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10700               build_int_cst (TREE_TYPE (t), -8));
10701   t = fold_convert (TREE_TYPE (arg), t);
10702   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10703   /* String up roundup and advance.  */
10704   if (roundup)
10705     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10706   /* String up with arg */
10707   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10708   /* Big-endianness related address adjustment.  */
10709   if (BLOCK_REG_PADDING (mode, type, 1) == downward
10710       && size < UNITS_PER_WORD)
10711   {
10712     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10713                 size_int (UNITS_PER_WORD - size));
10714     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10715   }
10716
10717   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10718   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10719
10720   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
10721   t = off;
10722   if (adjust)
10723     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10724                 build_int_cst (TREE_TYPE (off), adjust));
10725
10726   t = fold_convert (sizetype, t);
10727   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10728
10729   if (is_ha)
10730     {
10731       /* type ha; // treat as "struct {ftype field[n];}"
10732          ... [computing offs]
10733          for (i = 0; i <nregs; ++i, offs += 16)
10734            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10735          return ha;  */
10736       int i;
10737       tree tmp_ha, field_t, field_ptr_t;
10738
10739       /* Declare a local variable.  */
10740       tmp_ha = create_tmp_var_raw (type, "ha");
10741       gimple_add_tmp_var (tmp_ha);
10742
10743       /* Establish the base type.  */
10744       switch (ag_mode)
10745         {
10746         case E_SFmode:
10747           field_t = float_type_node;
10748           field_ptr_t = float_ptr_type_node;
10749           break;
10750         case E_DFmode:
10751           field_t = double_type_node;
10752           field_ptr_t = double_ptr_type_node;
10753           break;
10754         case E_TFmode:
10755           field_t = long_double_type_node;
10756           field_ptr_t = long_double_ptr_type_node;
10757           break;
10758         case E_HFmode:
10759           field_t = aarch64_fp16_type_node;
10760           field_ptr_t = aarch64_fp16_ptr_type_node;
10761           break;
10762         case E_V2SImode:
10763         case E_V4SImode:
10764             {
10765               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10766               field_t = build_vector_type_for_mode (innertype, ag_mode);
10767               field_ptr_t = build_pointer_type (field_t);
10768             }
10769           break;
10770         default:
10771           gcc_assert (0);
10772         }
10773
10774       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10775       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10776       addr = t;
10777       t = fold_convert (field_ptr_t, addr);
10778       t = build2 (MODIFY_EXPR, field_t,
10779                   build1 (INDIRECT_REF, field_t, tmp_ha),
10780                   build1 (INDIRECT_REF, field_t, t));
10781
10782       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10783       for (i = 1; i < nregs; ++i)
10784         {
10785           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10786           u = fold_convert (field_ptr_t, addr);
10787           u = build2 (MODIFY_EXPR, field_t,
10788                       build2 (MEM_REF, field_t, tmp_ha,
10789                               build_int_cst (field_ptr_t,
10790                                              (i *
10791                                               int_size_in_bytes (field_t)))),
10792                       build1 (INDIRECT_REF, field_t, u));
10793           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10794         }
10795
10796       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10797       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10798     }
10799
10800   COND_EXPR_ELSE (cond2) = t;
10801   addr = fold_convert (build_pointer_type (type), cond1);
10802   addr = build_va_arg_indirect_ref (addr);
10803
10804   if (indirect_p)
10805     addr = build_va_arg_indirect_ref (addr);
10806
10807   return addr;
10808 }
10809
10810 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10811
10812 static void
10813 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10814                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10815                                 int no_rtl)
10816 {
10817   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10818   CUMULATIVE_ARGS local_cum;
10819   int gr_saved = cfun->va_list_gpr_size;
10820   int vr_saved = cfun->va_list_fpr_size;
10821
10822   /* The caller has advanced CUM up to, but not beyond, the last named
10823      argument.  Advance a local copy of CUM past the last "real" named
10824      argument, to find out how many registers are left over.  */
10825   local_cum = *cum;
10826   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10827
10828   /* Found out how many registers we need to save.
10829      Honor tree-stdvar analysis results.  */
10830   if (cfun->va_list_gpr_size)
10831     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10832                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10833   if (cfun->va_list_fpr_size)
10834     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10835                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10836
10837   if (!TARGET_FLOAT)
10838     {
10839       gcc_assert (local_cum.aapcs_nvrn == 0);
10840       vr_saved = 0;
10841     }
10842
10843   if (!no_rtl)
10844     {
10845       if (gr_saved > 0)
10846         {
10847           rtx ptr, mem;
10848
10849           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10850           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10851                                - gr_saved * UNITS_PER_WORD);
10852           mem = gen_frame_mem (BLKmode, ptr);
10853           set_mem_alias_set (mem, get_varargs_alias_set ());
10854
10855           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10856                                mem, gr_saved);
10857         }
10858       if (vr_saved > 0)
10859         {
10860           /* We can't use move_block_from_reg, because it will use
10861              the wrong mode, storing D regs only.  */
10862           machine_mode mode = TImode;
10863           int off, i, vr_start;
10864
10865           /* Set OFF to the offset from virtual_incoming_args_rtx of
10866              the first vector register.  The VR save area lies below
10867              the GR one, and is aligned to 16 bytes.  */
10868           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10869                            STACK_BOUNDARY / BITS_PER_UNIT);
10870           off -= vr_saved * UNITS_PER_VREG;
10871
10872           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10873           for (i = 0; i < vr_saved; ++i)
10874             {
10875               rtx ptr, mem;
10876
10877               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10878               mem = gen_frame_mem (mode, ptr);
10879               set_mem_alias_set (mem, get_varargs_alias_set ());
10880               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10881               off += UNITS_PER_VREG;
10882             }
10883         }
10884     }
10885
10886   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10887      any complication of having crtl->args.pretend_args_size changed.  */
10888   cfun->machine->frame.saved_varargs_size
10889     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10890                  STACK_BOUNDARY / BITS_PER_UNIT)
10891        + vr_saved * UNITS_PER_VREG);
10892 }
10893
10894 static void
10895 aarch64_conditional_register_usage (void)
10896 {
10897   int i;
10898   if (!TARGET_FLOAT)
10899     {
10900       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10901         {
10902           fixed_regs[i] = 1;
10903           call_used_regs[i] = 1;
10904         }
10905     }
10906 }
10907
10908 /* Walk down the type tree of TYPE counting consecutive base elements.
10909    If *MODEP is VOIDmode, then set it to the first valid floating point
10910    type.  If a non-floating point type is found, or if a floating point
10911    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10912    otherwise return the count in the sub-tree.  */
10913 static int
10914 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10915 {
10916   machine_mode mode;
10917   HOST_WIDE_INT size;
10918
10919   switch (TREE_CODE (type))
10920     {
10921     case REAL_TYPE:
10922       mode = TYPE_MODE (type);
10923       if (mode != DFmode && mode != SFmode
10924           && mode != TFmode && mode != HFmode)
10925         return -1;
10926
10927       if (*modep == VOIDmode)
10928         *modep = mode;
10929
10930       if (*modep == mode)
10931         return 1;
10932
10933       break;
10934
10935     case COMPLEX_TYPE:
10936       mode = TYPE_MODE (TREE_TYPE (type));
10937       if (mode != DFmode && mode != SFmode
10938           && mode != TFmode && mode != HFmode)
10939         return -1;
10940
10941       if (*modep == VOIDmode)
10942         *modep = mode;
10943
10944       if (*modep == mode)
10945         return 2;
10946
10947       break;
10948
10949     case VECTOR_TYPE:
10950       /* Use V2SImode and V4SImode as representatives of all 64-bit
10951          and 128-bit vector types.  */
10952       size = int_size_in_bytes (type);
10953       switch (size)
10954         {
10955         case 8:
10956           mode = V2SImode;
10957           break;
10958         case 16:
10959           mode = V4SImode;
10960           break;
10961         default:
10962           return -1;
10963         }
10964
10965       if (*modep == VOIDmode)
10966         *modep = mode;
10967
10968       /* Vector modes are considered to be opaque: two vectors are
10969          equivalent for the purposes of being homogeneous aggregates
10970          if they are the same size.  */
10971       if (*modep == mode)
10972         return 1;
10973
10974       break;
10975
10976     case ARRAY_TYPE:
10977       {
10978         int count;
10979         tree index = TYPE_DOMAIN (type);
10980
10981         /* Can't handle incomplete types nor sizes that are not
10982            fixed.  */
10983         if (!COMPLETE_TYPE_P (type)
10984             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10985           return -1;
10986
10987         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10988         if (count == -1
10989             || !index
10990             || !TYPE_MAX_VALUE (index)
10991             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10992             || !TYPE_MIN_VALUE (index)
10993             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10994             || count < 0)
10995           return -1;
10996
10997         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10998                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10999
11000         /* There must be no padding.  */
11001         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11002           return -1;
11003
11004         return count;
11005       }
11006
11007     case RECORD_TYPE:
11008       {
11009         int count = 0;
11010         int sub_count;
11011         tree field;
11012
11013         /* Can't handle incomplete types nor sizes that are not
11014            fixed.  */
11015         if (!COMPLETE_TYPE_P (type)
11016             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11017           return -1;
11018
11019         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11020           {
11021             if (TREE_CODE (field) != FIELD_DECL)
11022               continue;
11023
11024             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11025             if (sub_count < 0)
11026               return -1;
11027             count += sub_count;
11028           }
11029
11030         /* There must be no padding.  */
11031         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11032           return -1;
11033
11034         return count;
11035       }
11036
11037     case UNION_TYPE:
11038     case QUAL_UNION_TYPE:
11039       {
11040         /* These aren't very interesting except in a degenerate case.  */
11041         int count = 0;
11042         int sub_count;
11043         tree field;
11044
11045         /* Can't handle incomplete types nor sizes that are not
11046            fixed.  */
11047         if (!COMPLETE_TYPE_P (type)
11048             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
11049           return -1;
11050
11051         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
11052           {
11053             if (TREE_CODE (field) != FIELD_DECL)
11054               continue;
11055
11056             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
11057             if (sub_count < 0)
11058               return -1;
11059             count = count > sub_count ? count : sub_count;
11060           }
11061
11062         /* There must be no padding.  */
11063         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
11064           return -1;
11065
11066         return count;
11067       }
11068
11069     default:
11070       break;
11071     }
11072
11073   return -1;
11074 }
11075
11076 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11077    type as described in AAPCS64 \S 4.1.2.
11078
11079    See the comment above aarch64_composite_type_p for the notes on MODE.  */
11080
11081 static bool
11082 aarch64_short_vector_p (const_tree type,
11083                         machine_mode mode)
11084 {
11085   HOST_WIDE_INT size = -1;
11086
11087   if (type && TREE_CODE (type) == VECTOR_TYPE)
11088     size = int_size_in_bytes (type);
11089   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
11090             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11091     size = GET_MODE_SIZE (mode);
11092
11093   return (size == 8 || size == 16);
11094 }
11095
11096 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11097    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
11098    array types.  The C99 floating-point complex types are also considered
11099    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
11100    types, which are GCC extensions and out of the scope of AAPCS64, are
11101    treated as composite types here as well.
11102
11103    Note that MODE itself is not sufficient in determining whether a type
11104    is such a composite type or not.  This is because
11105    stor-layout.c:compute_record_mode may have already changed the MODE
11106    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
11107    structure with only one field may have its MODE set to the mode of the
11108    field.  Also an integer mode whose size matches the size of the
11109    RECORD_TYPE type may be used to substitute the original mode
11110    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
11111    solely relied on.  */
11112
11113 static bool
11114 aarch64_composite_type_p (const_tree type,
11115                           machine_mode mode)
11116 {
11117   if (aarch64_short_vector_p (type, mode))
11118     return false;
11119
11120   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
11121     return true;
11122
11123   if (mode == BLKmode
11124       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
11125       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
11126     return true;
11127
11128   return false;
11129 }
11130
11131 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11132    shall be passed or returned in simd/fp register(s) (providing these
11133    parameter passing registers are available).
11134
11135    Upon successful return, *COUNT returns the number of needed registers,
11136    *BASE_MODE returns the mode of the individual register and when IS_HAF
11137    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11138    floating-point aggregate or a homogeneous short-vector aggregate.  */
11139
11140 static bool
11141 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
11142                                          const_tree type,
11143                                          machine_mode *base_mode,
11144                                          int *count,
11145                                          bool *is_ha)
11146 {
11147   machine_mode new_mode = VOIDmode;
11148   bool composite_p = aarch64_composite_type_p (type, mode);
11149
11150   if (is_ha != NULL) *is_ha = false;
11151
11152   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
11153       || aarch64_short_vector_p (type, mode))
11154     {
11155       *count = 1;
11156       new_mode = mode;
11157     }
11158   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
11159     {
11160       if (is_ha != NULL) *is_ha = true;
11161       *count = 2;
11162       new_mode = GET_MODE_INNER (mode);
11163     }
11164   else if (type && composite_p)
11165     {
11166       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
11167
11168       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
11169         {
11170           if (is_ha != NULL) *is_ha = true;
11171           *count = ag_count;
11172         }
11173       else
11174         return false;
11175     }
11176   else
11177     return false;
11178
11179   *base_mode = new_mode;
11180   return true;
11181 }
11182
11183 /* Implement TARGET_STRUCT_VALUE_RTX.  */
11184
11185 static rtx
11186 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
11187                           int incoming ATTRIBUTE_UNUSED)
11188 {
11189   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
11190 }
11191
11192 /* Implements target hook vector_mode_supported_p.  */
11193 static bool
11194 aarch64_vector_mode_supported_p (machine_mode mode)
11195 {
11196   if (TARGET_SIMD
11197       && (mode == V4SImode  || mode == V8HImode
11198           || mode == V16QImode || mode == V2DImode
11199           || mode == V2SImode  || mode == V4HImode
11200           || mode == V8QImode || mode == V2SFmode
11201           || mode == V4SFmode || mode == V2DFmode
11202           || mode == V4HFmode || mode == V8HFmode
11203           || mode == V1DFmode))
11204     return true;
11205
11206   return false;
11207 }
11208
11209 /* Return appropriate SIMD container
11210    for MODE within a vector of WIDTH bits.  */
11211 static machine_mode
11212 aarch64_simd_container_mode (machine_mode mode, unsigned width)
11213 {
11214   gcc_assert (width == 64 || width == 128);
11215   if (TARGET_SIMD)
11216     {
11217       if (width == 128)
11218         switch (mode)
11219           {
11220           case E_DFmode:
11221             return V2DFmode;
11222           case E_SFmode:
11223             return V4SFmode;
11224           case E_HFmode:
11225             return V8HFmode;
11226           case E_SImode:
11227             return V4SImode;
11228           case E_HImode:
11229             return V8HImode;
11230           case E_QImode:
11231             return V16QImode;
11232           case E_DImode:
11233             return V2DImode;
11234           default:
11235             break;
11236           }
11237       else
11238         switch (mode)
11239           {
11240           case E_SFmode:
11241             return V2SFmode;
11242           case E_HFmode:
11243             return V4HFmode;
11244           case E_SImode:
11245             return V2SImode;
11246           case E_HImode:
11247             return V4HImode;
11248           case E_QImode:
11249             return V8QImode;
11250           default:
11251             break;
11252           }
11253     }
11254   return word_mode;
11255 }
11256
11257 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
11258 static machine_mode
11259 aarch64_preferred_simd_mode (machine_mode mode)
11260 {
11261   return aarch64_simd_container_mode (mode, 128);
11262 }
11263
11264 /* Return the bitmask of possible vector sizes for the vectorizer
11265    to iterate over.  */
11266 static unsigned int
11267 aarch64_autovectorize_vector_sizes (void)
11268 {
11269   return (16 | 8);
11270 }
11271
11272 /* Implement TARGET_MANGLE_TYPE.  */
11273
11274 static const char *
11275 aarch64_mangle_type (const_tree type)
11276 {
11277   /* The AArch64 ABI documents say that "__va_list" has to be
11278      managled as if it is in the "std" namespace.  */
11279   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
11280     return "St9__va_list";
11281
11282   /* Half-precision float.  */
11283   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
11284     return "Dh";
11285
11286   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
11287      builtin types.  */
11288   if (TYPE_NAME (type) != NULL)
11289     return aarch64_mangle_builtin_type (type);
11290
11291   /* Use the default mangling.  */
11292   return NULL;
11293 }
11294
11295 /* Find the first rtx_insn before insn that will generate an assembly
11296    instruction.  */
11297
11298 static rtx_insn *
11299 aarch64_prev_real_insn (rtx_insn *insn)
11300 {
11301   if (!insn)
11302     return NULL;
11303
11304   do
11305     {
11306       insn = prev_real_insn (insn);
11307     }
11308   while (insn && recog_memoized (insn) < 0);
11309
11310   return insn;
11311 }
11312
11313 static bool
11314 is_madd_op (enum attr_type t1)
11315 {
11316   unsigned int i;
11317   /* A number of these may be AArch32 only.  */
11318   enum attr_type mlatypes[] = {
11319     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
11320     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
11321     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
11322   };
11323
11324   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11325     {
11326       if (t1 == mlatypes[i])
11327         return true;
11328     }
11329
11330   return false;
11331 }
11332
11333 /* Check if there is a register dependency between a load and the insn
11334    for which we hold recog_data.  */
11335
11336 static bool
11337 dep_between_memop_and_curr (rtx memop)
11338 {
11339   rtx load_reg;
11340   int opno;
11341
11342   gcc_assert (GET_CODE (memop) == SET);
11343
11344   if (!REG_P (SET_DEST (memop)))
11345     return false;
11346
11347   load_reg = SET_DEST (memop);
11348   for (opno = 1; opno < recog_data.n_operands; opno++)
11349     {
11350       rtx operand = recog_data.operand[opno];
11351       if (REG_P (operand)
11352           && reg_overlap_mentioned_p (load_reg, operand))
11353         return true;
11354
11355     }
11356   return false;
11357 }
11358
11359
11360 /* When working around the Cortex-A53 erratum 835769,
11361    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11362    instruction and has a preceding memory instruction such that a NOP
11363    should be inserted between them.  */
11364
11365 bool
11366 aarch64_madd_needs_nop (rtx_insn* insn)
11367 {
11368   enum attr_type attr_type;
11369   rtx_insn *prev;
11370   rtx body;
11371
11372   if (!TARGET_FIX_ERR_A53_835769)
11373     return false;
11374
11375   if (!INSN_P (insn) || recog_memoized (insn) < 0)
11376     return false;
11377
11378   attr_type = get_attr_type (insn);
11379   if (!is_madd_op (attr_type))
11380     return false;
11381
11382   prev = aarch64_prev_real_insn (insn);
11383   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11384      Restore recog state to INSN to avoid state corruption.  */
11385   extract_constrain_insn_cached (insn);
11386
11387   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11388     return false;
11389
11390   body = single_set (prev);
11391
11392   /* If the previous insn is a memory op and there is no dependency between
11393      it and the DImode madd, emit a NOP between them.  If body is NULL then we
11394      have a complex memory operation, probably a load/store pair.
11395      Be conservative for now and emit a NOP.  */
11396   if (GET_MODE (recog_data.operand[0]) == DImode
11397       && (!body || !dep_between_memop_and_curr (body)))
11398     return true;
11399
11400   return false;
11401
11402 }
11403
11404
11405 /* Implement FINAL_PRESCAN_INSN.  */
11406
11407 void
11408 aarch64_final_prescan_insn (rtx_insn *insn)
11409 {
11410   if (aarch64_madd_needs_nop (insn))
11411     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
11412 }
11413
11414
11415 /* Return the equivalent letter for size.  */
11416 static char
11417 sizetochar (int size)
11418 {
11419   switch (size)
11420     {
11421     case 64: return 'd';
11422     case 32: return 's';
11423     case 16: return 'h';
11424     case 8 : return 'b';
11425     default: gcc_unreachable ();
11426     }
11427 }
11428
11429 /* Return true iff x is a uniform vector of floating-point
11430    constants, and the constant can be represented in
11431    quarter-precision form.  Note, as aarch64_float_const_representable
11432    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11433 static bool
11434 aarch64_vect_float_const_representable_p (rtx x)
11435 {
11436   rtx elt;
11437   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11438           && const_vec_duplicate_p (x, &elt)
11439           && aarch64_float_const_representable_p (elt));
11440 }
11441
11442 /* Return true for valid and false for invalid.  */
11443 bool
11444 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11445                               struct simd_immediate_info *info)
11446 {
11447 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
11448   matches = 1;                                          \
11449   for (i = 0; i < idx; i += (STRIDE))                   \
11450     if (!(TEST))                                        \
11451       matches = 0;                                      \
11452   if (matches)                                          \
11453     {                                                   \
11454       immtype = (CLASS);                                \
11455       elsize = (ELSIZE);                                \
11456       eshift = (SHIFT);                                 \
11457       emvn = (NEG);                                     \
11458       break;                                            \
11459     }
11460
11461   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11462   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11463   unsigned char bytes[16];
11464   int immtype = -1, matches;
11465   unsigned int invmask = inverse ? 0xff : 0;
11466   int eshift, emvn;
11467
11468   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11469     {
11470       if (! (aarch64_simd_imm_zero_p (op, mode)
11471              || aarch64_vect_float_const_representable_p (op)))
11472         return false;
11473
11474       if (info)
11475         {
11476           info->value = CONST_VECTOR_ELT (op, 0);
11477           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11478           info->mvn = false;
11479           info->shift = 0;
11480         }
11481
11482       return true;
11483     }
11484
11485   /* Splat vector constant out into a byte vector.  */
11486   for (i = 0; i < n_elts; i++)
11487     {
11488       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
11489          it must be laid out in the vector register in reverse order.  */
11490       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11491       unsigned HOST_WIDE_INT elpart;
11492
11493       gcc_assert (CONST_INT_P (el));
11494       elpart = INTVAL (el);
11495
11496       for (unsigned int byte = 0; byte < innersize; byte++)
11497         {
11498           bytes[idx++] = (elpart & 0xff) ^ invmask;
11499           elpart >>= BITS_PER_UNIT;
11500         }
11501
11502     }
11503
11504   /* Sanity check.  */
11505   gcc_assert (idx == GET_MODE_SIZE (mode));
11506
11507   do
11508     {
11509       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11510              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11511
11512       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11513              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11514
11515       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11516              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11517
11518       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11519              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11520
11521       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11522
11523       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11524
11525       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11526              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11527
11528       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11529              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11530
11531       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11532              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11533
11534       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11535              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11536
11537       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11538
11539       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11540
11541       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11542              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11543
11544       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11545              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11546
11547       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11548              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11549
11550       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11551              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11552
11553       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11554
11555       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11556              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11557     }
11558   while (0);
11559
11560   if (immtype == -1)
11561     return false;
11562
11563   if (info)
11564     {
11565       info->element_width = elsize;
11566       info->mvn = emvn != 0;
11567       info->shift = eshift;
11568
11569       unsigned HOST_WIDE_INT imm = 0;
11570
11571       if (immtype >= 12 && immtype <= 15)
11572         info->msl = true;
11573
11574       /* Un-invert bytes of recognized vector, if necessary.  */
11575       if (invmask != 0)
11576         for (i = 0; i < idx; i++)
11577           bytes[i] ^= invmask;
11578
11579       if (immtype == 17)
11580         {
11581           /* FIXME: Broken on 32-bit H_W_I hosts.  */
11582           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11583
11584           for (i = 0; i < 8; i++)
11585             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11586               << (i * BITS_PER_UNIT);
11587
11588
11589           info->value = GEN_INT (imm);
11590         }
11591       else
11592         {
11593           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11594             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11595
11596           /* Construct 'abcdefgh' because the assembler cannot handle
11597              generic constants.  */
11598           if (info->mvn)
11599             imm = ~imm;
11600           imm = (imm >> info->shift) & 0xff;
11601           info->value = GEN_INT (imm);
11602         }
11603     }
11604
11605   return true;
11606 #undef CHECK
11607 }
11608
11609 /* Check of immediate shift constants are within range.  */
11610 bool
11611 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11612 {
11613   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11614   if (left)
11615     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11616   else
11617     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11618 }
11619
11620 /* Return true if X is a uniform vector where all elements
11621    are either the floating-point constant 0.0 or the
11622    integer constant 0.  */
11623 bool
11624 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11625 {
11626   return x == CONST0_RTX (mode);
11627 }
11628
11629
11630 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11631    operation of width WIDTH at bit position POS.  */
11632
11633 rtx
11634 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11635 {
11636   gcc_assert (CONST_INT_P (width));
11637   gcc_assert (CONST_INT_P (pos));
11638
11639   unsigned HOST_WIDE_INT mask
11640     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11641   return GEN_INT (mask << UINTVAL (pos));
11642 }
11643
11644 bool
11645 aarch64_mov_operand_p (rtx x, machine_mode mode)
11646 {
11647   if (GET_CODE (x) == HIGH
11648       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11649     return true;
11650
11651   if (CONST_INT_P (x))
11652     return true;
11653
11654   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11655     return true;
11656
11657   return aarch64_classify_symbolic_expression (x)
11658     == SYMBOL_TINY_ABSOLUTE;
11659 }
11660
11661 /* Return a const_int vector of VAL.  */
11662 rtx
11663 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11664 {
11665   int nunits = GET_MODE_NUNITS (mode);
11666   rtvec v = rtvec_alloc (nunits);
11667   int i;
11668
11669   rtx cache = GEN_INT (val);
11670
11671   for (i=0; i < nunits; i++)
11672     RTVEC_ELT (v, i) = cache;
11673
11674   return gen_rtx_CONST_VECTOR (mode, v);
11675 }
11676
11677 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
11678
11679 bool
11680 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11681 {
11682   machine_mode vmode;
11683
11684   gcc_assert (!VECTOR_MODE_P (mode));
11685   vmode = aarch64_preferred_simd_mode (mode);
11686   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11687   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11688 }
11689
11690 /* Construct and return a PARALLEL RTX vector with elements numbering the
11691    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11692    the vector - from the perspective of the architecture.  This does not
11693    line up with GCC's perspective on lane numbers, so we end up with
11694    different masks depending on our target endian-ness.  The diagram
11695    below may help.  We must draw the distinction when building masks
11696    which select one half of the vector.  An instruction selecting
11697    architectural low-lanes for a big-endian target, must be described using
11698    a mask selecting GCC high-lanes.
11699
11700                  Big-Endian             Little-Endian
11701
11702 GCC             0   1   2   3           3   2   1   0
11703               | x | x | x | x |       | x | x | x | x |
11704 Architecture    3   2   1   0           3   2   1   0
11705
11706 Low Mask:         { 2, 3 }                { 0, 1 }
11707 High Mask:        { 0, 1 }                { 2, 3 }
11708 */
11709
11710 rtx
11711 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11712 {
11713   int nunits = GET_MODE_NUNITS (mode);
11714   rtvec v = rtvec_alloc (nunits / 2);
11715   int high_base = nunits / 2;
11716   int low_base = 0;
11717   int base;
11718   rtx t1;
11719   int i;
11720
11721   if (BYTES_BIG_ENDIAN)
11722     base = high ? low_base : high_base;
11723   else
11724     base = high ? high_base : low_base;
11725
11726   for (i = 0; i < nunits / 2; i++)
11727     RTVEC_ELT (v, i) = GEN_INT (base + i);
11728
11729   t1 = gen_rtx_PARALLEL (mode, v);
11730   return t1;
11731 }
11732
11733 /* Check OP for validity as a PARALLEL RTX vector with elements
11734    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11735    from the perspective of the architecture.  See the diagram above
11736    aarch64_simd_vect_par_cnst_half for more details.  */
11737
11738 bool
11739 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11740                                        bool high)
11741 {
11742   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11743   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11744   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11745   int i = 0;
11746
11747   if (!VECTOR_MODE_P (mode))
11748     return false;
11749
11750   if (count_op != count_ideal)
11751     return false;
11752
11753   for (i = 0; i < count_ideal; i++)
11754     {
11755       rtx elt_op = XVECEXP (op, 0, i);
11756       rtx elt_ideal = XVECEXP (ideal, 0, i);
11757
11758       if (!CONST_INT_P (elt_op)
11759           || INTVAL (elt_ideal) != INTVAL (elt_op))
11760         return false;
11761     }
11762   return true;
11763 }
11764
11765 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11766    HIGH (exclusive).  */
11767 void
11768 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11769                           const_tree exp)
11770 {
11771   HOST_WIDE_INT lane;
11772   gcc_assert (CONST_INT_P (operand));
11773   lane = INTVAL (operand);
11774
11775   if (lane < low || lane >= high)
11776   {
11777     if (exp)
11778       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11779     else
11780       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11781   }
11782 }
11783
11784 /* Return TRUE if OP is a valid vector addressing mode.  */
11785 bool
11786 aarch64_simd_mem_operand_p (rtx op)
11787 {
11788   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11789                         || REG_P (XEXP (op, 0)));
11790 }
11791
11792 /* Emit a register copy from operand to operand, taking care not to
11793    early-clobber source registers in the process.
11794
11795    COUNT is the number of components into which the copy needs to be
11796    decomposed.  */
11797 void
11798 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
11799                                 unsigned int count)
11800 {
11801   unsigned int i;
11802   int rdest = REGNO (operands[0]);
11803   int rsrc = REGNO (operands[1]);
11804
11805   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11806       || rdest < rsrc)
11807     for (i = 0; i < count; i++)
11808       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11809                       gen_rtx_REG (mode, rsrc + i));
11810   else
11811     for (i = 0; i < count; i++)
11812       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11813                       gen_rtx_REG (mode, rsrc + count - i - 1));
11814 }
11815
11816 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11817    one of VSTRUCT modes: OI, CI, or XI.  */
11818 int
11819 aarch64_simd_attr_length_rglist (machine_mode mode)
11820 {
11821   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11822 }
11823
11824 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11825    alignment of a vector to 128 bits.  */
11826 static HOST_WIDE_INT
11827 aarch64_simd_vector_alignment (const_tree type)
11828 {
11829   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11830   return MIN (align, 128);
11831 }
11832
11833 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11834 static bool
11835 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11836 {
11837   if (is_packed)
11838     return false;
11839
11840   /* We guarantee alignment for vectors up to 128-bits.  */
11841   if (tree_int_cst_compare (TYPE_SIZE (type),
11842                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11843     return false;
11844
11845   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11846   return true;
11847 }
11848
11849 /* Return true if the vector misalignment factor is supported by the
11850    target.  */
11851 static bool
11852 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11853                                              const_tree type, int misalignment,
11854                                              bool is_packed)
11855 {
11856   if (TARGET_SIMD && STRICT_ALIGNMENT)
11857     {
11858       /* Return if movmisalign pattern is not supported for this mode.  */
11859       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11860         return false;
11861
11862       if (misalignment == -1)
11863         {
11864           /* Misalignment factor is unknown at compile time but we know
11865              it's word aligned.  */
11866           if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11867             {
11868               int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11869
11870               if (element_size != 64)
11871                 return true;
11872             }
11873           return false;
11874         }
11875     }
11876   return default_builtin_support_vector_misalignment (mode, type, misalignment,
11877                                                       is_packed);
11878 }
11879
11880 /* If VALS is a vector constant that can be loaded into a register
11881    using DUP, generate instructions to do so and return an RTX to
11882    assign to the register.  Otherwise return NULL_RTX.  */
11883 static rtx
11884 aarch64_simd_dup_constant (rtx vals)
11885 {
11886   machine_mode mode = GET_MODE (vals);
11887   machine_mode inner_mode = GET_MODE_INNER (mode);
11888   rtx x;
11889
11890   if (!const_vec_duplicate_p (vals, &x))
11891     return NULL_RTX;
11892
11893   /* We can load this constant by using DUP and a constant in a
11894      single ARM register.  This will be cheaper than a vector
11895      load.  */
11896   x = copy_to_mode_reg (inner_mode, x);
11897   return gen_rtx_VEC_DUPLICATE (mode, x);
11898 }
11899
11900
11901 /* Generate code to load VALS, which is a PARALLEL containing only
11902    constants (for vec_init) or CONST_VECTOR, efficiently into a
11903    register.  Returns an RTX to copy into the register, or NULL_RTX
11904    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11905 static rtx
11906 aarch64_simd_make_constant (rtx vals)
11907 {
11908   machine_mode mode = GET_MODE (vals);
11909   rtx const_dup;
11910   rtx const_vec = NULL_RTX;
11911   int n_elts = GET_MODE_NUNITS (mode);
11912   int n_const = 0;
11913   int i;
11914
11915   if (GET_CODE (vals) == CONST_VECTOR)
11916     const_vec = vals;
11917   else if (GET_CODE (vals) == PARALLEL)
11918     {
11919       /* A CONST_VECTOR must contain only CONST_INTs and
11920          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11921          Only store valid constants in a CONST_VECTOR.  */
11922       for (i = 0; i < n_elts; ++i)
11923         {
11924           rtx x = XVECEXP (vals, 0, i);
11925           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11926             n_const++;
11927         }
11928       if (n_const == n_elts)
11929         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11930     }
11931   else
11932     gcc_unreachable ();
11933
11934   if (const_vec != NULL_RTX
11935       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11936     /* Load using MOVI/MVNI.  */
11937     return const_vec;
11938   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11939     /* Loaded using DUP.  */
11940     return const_dup;
11941   else if (const_vec != NULL_RTX)
11942     /* Load from constant pool. We can not take advantage of single-cycle
11943        LD1 because we need a PC-relative addressing mode.  */
11944     return const_vec;
11945   else
11946     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11947        We can not construct an initializer.  */
11948     return NULL_RTX;
11949 }
11950
11951 /* Expand a vector initialisation sequence, such that TARGET is
11952    initialised to contain VALS.  */
11953
11954 void
11955 aarch64_expand_vector_init (rtx target, rtx vals)
11956 {
11957   machine_mode mode = GET_MODE (target);
11958   machine_mode inner_mode = GET_MODE_INNER (mode);
11959   /* The number of vector elements.  */
11960   int n_elts = GET_MODE_NUNITS (mode);
11961   /* The number of vector elements which are not constant.  */
11962   int n_var = 0;
11963   rtx any_const = NULL_RTX;
11964   /* The first element of vals.  */
11965   rtx v0 = XVECEXP (vals, 0, 0);
11966   bool all_same = true;
11967
11968   /* Count the number of variable elements to initialise.  */
11969   for (int i = 0; i < n_elts; ++i)
11970     {
11971       rtx x = XVECEXP (vals, 0, i);
11972       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11973         ++n_var;
11974       else
11975         any_const = x;
11976
11977       all_same &= rtx_equal_p (x, v0);
11978     }
11979
11980   /* No variable elements, hand off to aarch64_simd_make_constant which knows
11981      how best to handle this.  */
11982   if (n_var == 0)
11983     {
11984       rtx constant = aarch64_simd_make_constant (vals);
11985       if (constant != NULL_RTX)
11986         {
11987           emit_move_insn (target, constant);
11988           return;
11989         }
11990     }
11991
11992   /* Splat a single non-constant element if we can.  */
11993   if (all_same)
11994     {
11995       rtx x = copy_to_mode_reg (inner_mode, v0);
11996       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11997       return;
11998     }
11999
12000   enum insn_code icode = optab_handler (vec_set_optab, mode);
12001   gcc_assert (icode != CODE_FOR_nothing);
12002
12003   /* If there are only variable elements, try to optimize
12004      the insertion using dup for the most common element
12005      followed by insertions.  */
12006
12007   /* The algorithm will fill matches[*][0] with the earliest matching element,
12008      and matches[X][1] with the count of duplicate elements (if X is the
12009      earliest element which has duplicates).  */
12010
12011   if (n_var == n_elts && n_elts <= 16)
12012     {
12013       int matches[16][2] = {0};
12014       for (int i = 0; i < n_elts; i++)
12015         {
12016           for (int j = 0; j <= i; j++)
12017             {
12018               if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
12019                 {
12020                   matches[i][0] = j;
12021                   matches[j][1]++;
12022                   break;
12023                 }
12024             }
12025         }
12026       int maxelement = 0;
12027       int maxv = 0;
12028       for (int i = 0; i < n_elts; i++)
12029         if (matches[i][1] > maxv)
12030           {
12031             maxelement = i;
12032             maxv = matches[i][1];
12033           }
12034
12035       /* Create a duplicate of the most common element.  */
12036       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
12037       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
12038
12039       /* Insert the rest.  */
12040       for (int i = 0; i < n_elts; i++)
12041         {
12042           rtx x = XVECEXP (vals, 0, i);
12043           if (matches[i][0] == maxelement)
12044             continue;
12045           x = copy_to_mode_reg (inner_mode, x);
12046           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12047         }
12048       return;
12049     }
12050
12051   /* Initialise a vector which is part-variable.  We want to first try
12052      to build those lanes which are constant in the most efficient way we
12053      can.  */
12054   if (n_var != n_elts)
12055     {
12056       rtx copy = copy_rtx (vals);
12057
12058       /* Load constant part of vector.  We really don't care what goes into the
12059          parts we will overwrite, but we're more likely to be able to load the
12060          constant efficiently if it has fewer, larger, repeating parts
12061          (see aarch64_simd_valid_immediate).  */
12062       for (int i = 0; i < n_elts; i++)
12063         {
12064           rtx x = XVECEXP (vals, 0, i);
12065           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12066             continue;
12067           rtx subst = any_const;
12068           for (int bit = n_elts / 2; bit > 0; bit /= 2)
12069             {
12070               /* Look in the copied vector, as more elements are const.  */
12071               rtx test = XVECEXP (copy, 0, i ^ bit);
12072               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
12073                 {
12074                   subst = test;
12075                   break;
12076                 }
12077             }
12078           XVECEXP (copy, 0, i) = subst;
12079         }
12080       aarch64_expand_vector_init (target, copy);
12081     }
12082
12083   /* Insert the variable lanes directly.  */
12084   for (int i = 0; i < n_elts; i++)
12085     {
12086       rtx x = XVECEXP (vals, 0, i);
12087       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
12088         continue;
12089       x = copy_to_mode_reg (inner_mode, x);
12090       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
12091     }
12092 }
12093
12094 static unsigned HOST_WIDE_INT
12095 aarch64_shift_truncation_mask (machine_mode mode)
12096 {
12097   return
12098     (!SHIFT_COUNT_TRUNCATED
12099      || aarch64_vector_mode_supported_p (mode)
12100      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
12101 }
12102
12103 /* Select a format to encode pointers in exception handling data.  */
12104 int
12105 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
12106 {
12107    int type;
12108    switch (aarch64_cmodel)
12109      {
12110      case AARCH64_CMODEL_TINY:
12111      case AARCH64_CMODEL_TINY_PIC:
12112      case AARCH64_CMODEL_SMALL:
12113      case AARCH64_CMODEL_SMALL_PIC:
12114      case AARCH64_CMODEL_SMALL_SPIC:
12115        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
12116           for everything.  */
12117        type = DW_EH_PE_sdata4;
12118        break;
12119      default:
12120        /* No assumptions here.  8-byte relocs required.  */
12121        type = DW_EH_PE_sdata8;
12122        break;
12123      }
12124    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
12125 }
12126
12127 /* The last .arch and .tune assembly strings that we printed.  */
12128 static std::string aarch64_last_printed_arch_string;
12129 static std::string aarch64_last_printed_tune_string;
12130
12131 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
12132    by the function fndecl.  */
12133
12134 void
12135 aarch64_declare_function_name (FILE *stream, const char* name,
12136                                 tree fndecl)
12137 {
12138   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12139
12140   struct cl_target_option *targ_options;
12141   if (target_parts)
12142     targ_options = TREE_TARGET_OPTION (target_parts);
12143   else
12144     targ_options = TREE_TARGET_OPTION (target_option_current_node);
12145   gcc_assert (targ_options);
12146
12147   const struct processor *this_arch
12148     = aarch64_get_arch (targ_options->x_explicit_arch);
12149
12150   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
12151   std::string extension
12152     = aarch64_get_extension_string_for_isa_flags (isa_flags,
12153                                                   this_arch->flags);
12154   /* Only update the assembler .arch string if it is distinct from the last
12155      such string we printed.  */
12156   std::string to_print = this_arch->name + extension;
12157   if (to_print != aarch64_last_printed_arch_string)
12158     {
12159       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
12160       aarch64_last_printed_arch_string = to_print;
12161     }
12162
12163   /* Print the cpu name we're tuning for in the comments, might be
12164      useful to readers of the generated asm.  Do it only when it changes
12165      from function to function and verbose assembly is requested.  */
12166   const struct processor *this_tune
12167     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
12168
12169   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
12170     {
12171       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
12172                    this_tune->name);
12173       aarch64_last_printed_tune_string = this_tune->name;
12174     }
12175
12176   /* Don't forget the type directive for ELF.  */
12177   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
12178   ASM_OUTPUT_LABEL (stream, name);
12179 }
12180
12181 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
12182
12183 static void
12184 aarch64_start_file (void)
12185 {
12186   struct cl_target_option *default_options
12187     = TREE_TARGET_OPTION (target_option_default_node);
12188
12189   const struct processor *default_arch
12190     = aarch64_get_arch (default_options->x_explicit_arch);
12191   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
12192   std::string extension
12193     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
12194                                                   default_arch->flags);
12195
12196    aarch64_last_printed_arch_string = default_arch->name + extension;
12197    aarch64_last_printed_tune_string = "";
12198    asm_fprintf (asm_out_file, "\t.arch %s\n",
12199                 aarch64_last_printed_arch_string.c_str ());
12200
12201    default_file_start ();
12202 }
12203
12204 /* Emit load exclusive.  */
12205
12206 static void
12207 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
12208                              rtx mem, rtx model_rtx)
12209 {
12210   rtx (*gen) (rtx, rtx, rtx);
12211
12212   switch (mode)
12213     {
12214     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
12215     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
12216     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
12217     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
12218     default:
12219       gcc_unreachable ();
12220     }
12221
12222   emit_insn (gen (rval, mem, model_rtx));
12223 }
12224
12225 /* Emit store exclusive.  */
12226
12227 static void
12228 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
12229                               rtx rval, rtx mem, rtx model_rtx)
12230 {
12231   rtx (*gen) (rtx, rtx, rtx, rtx);
12232
12233   switch (mode)
12234     {
12235     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
12236     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
12237     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
12238     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
12239     default:
12240       gcc_unreachable ();
12241     }
12242
12243   emit_insn (gen (bval, rval, mem, model_rtx));
12244 }
12245
12246 /* Mark the previous jump instruction as unlikely.  */
12247
12248 static void
12249 aarch64_emit_unlikely_jump (rtx insn)
12250 {
12251   rtx_insn *jump = emit_jump_insn (insn);
12252   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
12253 }
12254
12255 /* Expand a compare and swap pattern.  */
12256
12257 void
12258 aarch64_expand_compare_and_swap (rtx operands[])
12259 {
12260   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
12261   machine_mode mode, cmp_mode;
12262   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
12263   int idx;
12264   gen_cas_fn gen;
12265   const gen_cas_fn split_cas[] =
12266   {
12267     gen_aarch64_compare_and_swapqi,
12268     gen_aarch64_compare_and_swaphi,
12269     gen_aarch64_compare_and_swapsi,
12270     gen_aarch64_compare_and_swapdi
12271   };
12272   const gen_cas_fn atomic_cas[] =
12273   {
12274     gen_aarch64_compare_and_swapqi_lse,
12275     gen_aarch64_compare_and_swaphi_lse,
12276     gen_aarch64_compare_and_swapsi_lse,
12277     gen_aarch64_compare_and_swapdi_lse
12278   };
12279
12280   bval = operands[0];
12281   rval = operands[1];
12282   mem = operands[2];
12283   oldval = operands[3];
12284   newval = operands[4];
12285   is_weak = operands[5];
12286   mod_s = operands[6];
12287   mod_f = operands[7];
12288   mode = GET_MODE (mem);
12289   cmp_mode = mode;
12290
12291   /* Normally the succ memory model must be stronger than fail, but in the
12292      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12293      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
12294
12295   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
12296       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
12297     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
12298
12299   switch (mode)
12300     {
12301     case E_QImode:
12302     case E_HImode:
12303       /* For short modes, we're going to perform the comparison in SImode,
12304          so do the zero-extension now.  */
12305       cmp_mode = SImode;
12306       rval = gen_reg_rtx (SImode);
12307       oldval = convert_modes (SImode, mode, oldval, true);
12308       /* Fall through.  */
12309
12310     case E_SImode:
12311     case E_DImode:
12312       /* Force the value into a register if needed.  */
12313       if (!aarch64_plus_operand (oldval, mode))
12314         oldval = force_reg (cmp_mode, oldval);
12315       break;
12316
12317     default:
12318       gcc_unreachable ();
12319     }
12320
12321   switch (mode)
12322     {
12323     case E_QImode: idx = 0; break;
12324     case E_HImode: idx = 1; break;
12325     case E_SImode: idx = 2; break;
12326     case E_DImode: idx = 3; break;
12327     default:
12328       gcc_unreachable ();
12329     }
12330   if (TARGET_LSE)
12331     gen = atomic_cas[idx];
12332   else
12333     gen = split_cas[idx];
12334
12335   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
12336
12337   if (mode == QImode || mode == HImode)
12338     emit_move_insn (operands[1], gen_lowpart (mode, rval));
12339
12340   x = gen_rtx_REG (CCmode, CC_REGNUM);
12341   x = gen_rtx_EQ (SImode, x, const0_rtx);
12342   emit_insn (gen_rtx_SET (bval, x));
12343 }
12344
12345 /* Test whether the target supports using a atomic load-operate instruction.
12346    CODE is the operation and AFTER is TRUE if the data in memory after the
12347    operation should be returned and FALSE if the data before the operation
12348    should be returned.  Returns FALSE if the operation isn't supported by the
12349    architecture.  */
12350
12351 bool
12352 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12353 {
12354   if (!TARGET_LSE)
12355     return false;
12356
12357   switch (code)
12358     {
12359     case SET:
12360     case AND:
12361     case IOR:
12362     case XOR:
12363     case MINUS:
12364     case PLUS:
12365       return true;
12366     default:
12367       return false;
12368     }
12369 }
12370
12371 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12372    sequence implementing an atomic operation.  */
12373
12374 static void
12375 aarch64_emit_post_barrier (enum memmodel model)
12376 {
12377   const enum memmodel base_model = memmodel_base (model);
12378
12379   if (is_mm_sync (model)
12380       && (base_model == MEMMODEL_ACQUIRE
12381           || base_model == MEMMODEL_ACQ_REL
12382           || base_model == MEMMODEL_SEQ_CST))
12383     {
12384       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
12385     }
12386 }
12387
12388 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
12389    for the data in memory.  EXPECTED is the value expected to be in memory.
12390    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
12391    is the memory ordering to use.  */
12392
12393 void
12394 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12395                         rtx expected, rtx desired,
12396                         rtx model)
12397 {
12398   rtx (*gen) (rtx, rtx, rtx, rtx);
12399   machine_mode mode;
12400
12401   mode = GET_MODE (mem);
12402
12403   switch (mode)
12404     {
12405     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
12406     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
12407     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
12408     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
12409     default:
12410       gcc_unreachable ();
12411     }
12412
12413   /* Move the expected value into the CAS destination register.  */
12414   emit_insn (gen_rtx_SET (rval, expected));
12415
12416   /* Emit the CAS.  */
12417   emit_insn (gen (rval, mem, desired, model));
12418
12419   /* Compare the expected value with the value loaded by the CAS, to establish
12420      whether the swap was made.  */
12421   aarch64_gen_compare_reg (EQ, rval, expected);
12422 }
12423
12424 /* Split a compare and swap pattern.  */
12425
12426 void
12427 aarch64_split_compare_and_swap (rtx operands[])
12428 {
12429   rtx rval, mem, oldval, newval, scratch;
12430   machine_mode mode;
12431   bool is_weak;
12432   rtx_code_label *label1, *label2;
12433   rtx x, cond;
12434   enum memmodel model;
12435   rtx model_rtx;
12436
12437   rval = operands[0];
12438   mem = operands[1];
12439   oldval = operands[2];
12440   newval = operands[3];
12441   is_weak = (operands[4] != const0_rtx);
12442   model_rtx = operands[5];
12443   scratch = operands[7];
12444   mode = GET_MODE (mem);
12445   model = memmodel_from_int (INTVAL (model_rtx));
12446
12447   /* When OLDVAL is zero and we want the strong version we can emit a tighter
12448     loop:
12449     .label1:
12450         LD[A]XR rval, [mem]
12451         CBNZ    rval, .label2
12452         ST[L]XR scratch, newval, [mem]
12453         CBNZ    scratch, .label1
12454     .label2:
12455         CMP     rval, 0.  */
12456   bool strong_zero_p = !is_weak && oldval == const0_rtx;
12457
12458   label1 = NULL;
12459   if (!is_weak)
12460     {
12461       label1 = gen_label_rtx ();
12462       emit_label (label1);
12463     }
12464   label2 = gen_label_rtx ();
12465
12466   /* The initial load can be relaxed for a __sync operation since a final
12467      barrier will be emitted to stop code hoisting.  */
12468   if (is_mm_sync (model))
12469     aarch64_emit_load_exclusive (mode, rval, mem,
12470                                  GEN_INT (MEMMODEL_RELAXED));
12471   else
12472     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12473
12474   if (strong_zero_p)
12475     {
12476       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
12477       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12478                                 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12479       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12480     }
12481   else
12482     {
12483       cond = aarch64_gen_compare_reg (NE, rval, oldval);
12484       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12485       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12486                                  gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12487       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12488     }
12489
12490   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12491
12492   if (!is_weak)
12493     {
12494       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12495       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12496                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12497       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12498     }
12499   else
12500     {
12501       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12502       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12503       emit_insn (gen_rtx_SET (cond, x));
12504     }
12505
12506   emit_label (label2);
12507   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12508      to set the condition flags.  If this is not used it will be removed by
12509      later passes.  */
12510   if (strong_zero_p)
12511     {
12512       cond = gen_rtx_REG (CCmode, CC_REGNUM);
12513       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
12514       emit_insn (gen_rtx_SET (cond, x));
12515     }
12516   /* Emit any final barrier needed for a __sync operation.  */
12517   if (is_mm_sync (model))
12518     aarch64_emit_post_barrier (model);
12519 }
12520
12521 /* Emit a BIC instruction.  */
12522
12523 static void
12524 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12525 {
12526   rtx shift_rtx = GEN_INT (shift);
12527   rtx (*gen) (rtx, rtx, rtx, rtx);
12528
12529   switch (mode)
12530     {
12531     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12532     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12533     default:
12534       gcc_unreachable ();
12535     }
12536
12537   emit_insn (gen (dst, s2, shift_rtx, s1));
12538 }
12539
12540 /* Emit an atomic swap.  */
12541
12542 static void
12543 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12544                           rtx mem, rtx model)
12545 {
12546   rtx (*gen) (rtx, rtx, rtx, rtx);
12547
12548   switch (mode)
12549     {
12550     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
12551     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
12552     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
12553     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
12554     default:
12555       gcc_unreachable ();
12556     }
12557
12558   emit_insn (gen (dst, mem, value, model));
12559 }
12560
12561 /* Operations supported by aarch64_emit_atomic_load_op.  */
12562
12563 enum aarch64_atomic_load_op_code
12564 {
12565   AARCH64_LDOP_PLUS,    /* A + B  */
12566   AARCH64_LDOP_XOR,     /* A ^ B  */
12567   AARCH64_LDOP_OR,      /* A | B  */
12568   AARCH64_LDOP_BIC      /* A & ~B  */
12569 };
12570
12571 /* Emit an atomic load-operate.  */
12572
12573 static void
12574 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12575                              machine_mode mode, rtx dst, rtx src,
12576                              rtx mem, rtx model)
12577 {
12578   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12579   const aarch64_atomic_load_op_fn plus[] =
12580   {
12581     gen_aarch64_atomic_loadaddqi,
12582     gen_aarch64_atomic_loadaddhi,
12583     gen_aarch64_atomic_loadaddsi,
12584     gen_aarch64_atomic_loadadddi
12585   };
12586   const aarch64_atomic_load_op_fn eor[] =
12587   {
12588     gen_aarch64_atomic_loadeorqi,
12589     gen_aarch64_atomic_loadeorhi,
12590     gen_aarch64_atomic_loadeorsi,
12591     gen_aarch64_atomic_loadeordi
12592   };
12593   const aarch64_atomic_load_op_fn ior[] =
12594   {
12595     gen_aarch64_atomic_loadsetqi,
12596     gen_aarch64_atomic_loadsethi,
12597     gen_aarch64_atomic_loadsetsi,
12598     gen_aarch64_atomic_loadsetdi
12599   };
12600   const aarch64_atomic_load_op_fn bic[] =
12601   {
12602     gen_aarch64_atomic_loadclrqi,
12603     gen_aarch64_atomic_loadclrhi,
12604     gen_aarch64_atomic_loadclrsi,
12605     gen_aarch64_atomic_loadclrdi
12606   };
12607   aarch64_atomic_load_op_fn gen;
12608   int idx = 0;
12609
12610   switch (mode)
12611     {
12612     case E_QImode: idx = 0; break;
12613     case E_HImode: idx = 1; break;
12614     case E_SImode: idx = 2; break;
12615     case E_DImode: idx = 3; break;
12616     default:
12617       gcc_unreachable ();
12618     }
12619
12620   switch (code)
12621     {
12622     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12623     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12624     case AARCH64_LDOP_OR: gen = ior[idx]; break;
12625     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12626     default:
12627       gcc_unreachable ();
12628     }
12629
12630   emit_insn (gen (dst, mem, src, model));
12631 }
12632
12633 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
12634    location to store the data read from memory.  OUT_RESULT is the location to
12635    store the result of the operation.  MEM is the memory location to read and
12636    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
12637    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
12638    be NULL.  */
12639
12640 void
12641 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12642                          rtx mem, rtx value, rtx model_rtx)
12643 {
12644   machine_mode mode = GET_MODE (mem);
12645   machine_mode wmode = (mode == DImode ? DImode : SImode);
12646   const bool short_mode = (mode < SImode);
12647   aarch64_atomic_load_op_code ldop_code;
12648   rtx src;
12649   rtx x;
12650
12651   if (out_data)
12652     out_data = gen_lowpart (mode, out_data);
12653
12654   if (out_result)
12655     out_result = gen_lowpart (mode, out_result);
12656
12657   /* Make sure the value is in a register, putting it into a destination
12658      register if it needs to be manipulated.  */
12659   if (!register_operand (value, mode)
12660       || code == AND || code == MINUS)
12661     {
12662       src = out_result ? out_result : out_data;
12663       emit_move_insn (src, gen_lowpart (mode, value));
12664     }
12665   else
12666     src = value;
12667   gcc_assert (register_operand (src, mode));
12668
12669   /* Preprocess the data for the operation as necessary.  If the operation is
12670      a SET then emit a swap instruction and finish.  */
12671   switch (code)
12672     {
12673     case SET:
12674       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12675       return;
12676
12677     case MINUS:
12678       /* Negate the value and treat it as a PLUS.  */
12679       {
12680         rtx neg_src;
12681
12682         /* Resize the value if necessary.  */
12683         if (short_mode)
12684           src = gen_lowpart (wmode, src);
12685
12686         neg_src = gen_rtx_NEG (wmode, src);
12687         emit_insn (gen_rtx_SET (src, neg_src));
12688
12689         if (short_mode)
12690           src = gen_lowpart (mode, src);
12691       }
12692       /* Fall-through.  */
12693     case PLUS:
12694       ldop_code = AARCH64_LDOP_PLUS;
12695       break;
12696
12697     case IOR:
12698       ldop_code = AARCH64_LDOP_OR;
12699       break;
12700
12701     case XOR:
12702       ldop_code = AARCH64_LDOP_XOR;
12703       break;
12704
12705     case AND:
12706       {
12707         rtx not_src;
12708
12709         /* Resize the value if necessary.  */
12710         if (short_mode)
12711           src = gen_lowpart (wmode, src);
12712
12713         not_src = gen_rtx_NOT (wmode, src);
12714         emit_insn (gen_rtx_SET (src, not_src));
12715
12716         if (short_mode)
12717           src = gen_lowpart (mode, src);
12718       }
12719       ldop_code = AARCH64_LDOP_BIC;
12720       break;
12721
12722     default:
12723       /* The operation can't be done with atomic instructions.  */
12724       gcc_unreachable ();
12725     }
12726
12727   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12728
12729   /* If necessary, calculate the data in memory after the update by redoing the
12730      operation from values in registers.  */
12731   if (!out_result)
12732     return;
12733
12734   if (short_mode)
12735     {
12736       src = gen_lowpart (wmode, src);
12737       out_data = gen_lowpart (wmode, out_data);
12738       out_result = gen_lowpart (wmode, out_result);
12739     }
12740
12741   x = NULL_RTX;
12742
12743   switch (code)
12744     {
12745     case MINUS:
12746     case PLUS:
12747       x = gen_rtx_PLUS (wmode, out_data, src);
12748       break;
12749     case IOR:
12750       x = gen_rtx_IOR (wmode, out_data, src);
12751       break;
12752     case XOR:
12753       x = gen_rtx_XOR (wmode, out_data, src);
12754       break;
12755     case AND:
12756       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12757       return;
12758     default:
12759       gcc_unreachable ();
12760     }
12761
12762   emit_set_insn (out_result, x);
12763
12764   return;
12765 }
12766
12767 /* Split an atomic operation.  */
12768
12769 void
12770 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12771                          rtx value, rtx model_rtx, rtx cond)
12772 {
12773   machine_mode mode = GET_MODE (mem);
12774   machine_mode wmode = (mode == DImode ? DImode : SImode);
12775   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12776   const bool is_sync = is_mm_sync (model);
12777   rtx_code_label *label;
12778   rtx x;
12779
12780   /* Split the atomic operation into a sequence.  */
12781   label = gen_label_rtx ();
12782   emit_label (label);
12783
12784   if (new_out)
12785     new_out = gen_lowpart (wmode, new_out);
12786   if (old_out)
12787     old_out = gen_lowpart (wmode, old_out);
12788   else
12789     old_out = new_out;
12790   value = simplify_gen_subreg (wmode, value, mode, 0);
12791
12792   /* The initial load can be relaxed for a __sync operation since a final
12793      barrier will be emitted to stop code hoisting.  */
12794  if (is_sync)
12795     aarch64_emit_load_exclusive (mode, old_out, mem,
12796                                  GEN_INT (MEMMODEL_RELAXED));
12797   else
12798     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12799
12800   switch (code)
12801     {
12802     case SET:
12803       new_out = value;
12804       break;
12805
12806     case NOT:
12807       x = gen_rtx_AND (wmode, old_out, value);
12808       emit_insn (gen_rtx_SET (new_out, x));
12809       x = gen_rtx_NOT (wmode, new_out);
12810       emit_insn (gen_rtx_SET (new_out, x));
12811       break;
12812
12813     case MINUS:
12814       if (CONST_INT_P (value))
12815         {
12816           value = GEN_INT (-INTVAL (value));
12817           code = PLUS;
12818         }
12819       /* Fall through.  */
12820
12821     default:
12822       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12823       emit_insn (gen_rtx_SET (new_out, x));
12824       break;
12825     }
12826
12827   aarch64_emit_store_exclusive (mode, cond, mem,
12828                                 gen_lowpart (mode, new_out), model_rtx);
12829
12830   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12831   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12832                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12833   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12834
12835   /* Emit any final barrier needed for a __sync operation.  */
12836   if (is_sync)
12837     aarch64_emit_post_barrier (model);
12838 }
12839
12840 static void
12841 aarch64_init_libfuncs (void)
12842 {
12843    /* Half-precision float operations.  The compiler handles all operations
12844      with NULL libfuncs by converting to SFmode.  */
12845
12846   /* Conversions.  */
12847   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12848   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12849
12850   /* Arithmetic.  */
12851   set_optab_libfunc (add_optab, HFmode, NULL);
12852   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12853   set_optab_libfunc (smul_optab, HFmode, NULL);
12854   set_optab_libfunc (neg_optab, HFmode, NULL);
12855   set_optab_libfunc (sub_optab, HFmode, NULL);
12856
12857   /* Comparisons.  */
12858   set_optab_libfunc (eq_optab, HFmode, NULL);
12859   set_optab_libfunc (ne_optab, HFmode, NULL);
12860   set_optab_libfunc (lt_optab, HFmode, NULL);
12861   set_optab_libfunc (le_optab, HFmode, NULL);
12862   set_optab_libfunc (ge_optab, HFmode, NULL);
12863   set_optab_libfunc (gt_optab, HFmode, NULL);
12864   set_optab_libfunc (unord_optab, HFmode, NULL);
12865 }
12866
12867 /* Target hook for c_mode_for_suffix.  */
12868 static machine_mode
12869 aarch64_c_mode_for_suffix (char suffix)
12870 {
12871   if (suffix == 'q')
12872     return TFmode;
12873
12874   return VOIDmode;
12875 }
12876
12877 /* We can only represent floating point constants which will fit in
12878    "quarter-precision" values.  These values are characterised by
12879    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12880    by:
12881
12882    (-1)^s * (n/16) * 2^r
12883
12884    Where:
12885      's' is the sign bit.
12886      'n' is an integer in the range 16 <= n <= 31.
12887      'r' is an integer in the range -3 <= r <= 4.  */
12888
12889 /* Return true iff X can be represented by a quarter-precision
12890    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12891 bool
12892 aarch64_float_const_representable_p (rtx x)
12893 {
12894   /* This represents our current view of how many bits
12895      make up the mantissa.  */
12896   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12897   int exponent;
12898   unsigned HOST_WIDE_INT mantissa, mask;
12899   REAL_VALUE_TYPE r, m;
12900   bool fail;
12901
12902   if (!CONST_DOUBLE_P (x))
12903     return false;
12904
12905   /* We don't support HFmode constants yet.  */
12906   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12907     return false;
12908
12909   r = *CONST_DOUBLE_REAL_VALUE (x);
12910
12911   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12912      know if we have +zero until we analyse the mantissa, but we
12913      can reject the other invalid values.  */
12914   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12915       || REAL_VALUE_MINUS_ZERO (r))
12916     return false;
12917
12918   /* Extract exponent.  */
12919   r = real_value_abs (&r);
12920   exponent = REAL_EXP (&r);
12921
12922   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12923      highest (sign) bit, with a fixed binary point at bit point_pos.
12924      m1 holds the low part of the mantissa, m2 the high part.
12925      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12926      bits for the mantissa, this can fail (low bits will be lost).  */
12927   real_ldexp (&m, &r, point_pos - exponent);
12928   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12929
12930   /* If the low part of the mantissa has bits set we cannot represent
12931      the value.  */
12932   if (w.ulow () != 0)
12933     return false;
12934   /* We have rejected the lower HOST_WIDE_INT, so update our
12935      understanding of how many bits lie in the mantissa and
12936      look only at the high HOST_WIDE_INT.  */
12937   mantissa = w.elt (1);
12938   point_pos -= HOST_BITS_PER_WIDE_INT;
12939
12940   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12941   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12942   if ((mantissa & mask) != 0)
12943     return false;
12944
12945   /* Having filtered unrepresentable values, we may now remove all
12946      but the highest 5 bits.  */
12947   mantissa >>= point_pos - 5;
12948
12949   /* We cannot represent the value 0.0, so reject it.  This is handled
12950      elsewhere.  */
12951   if (mantissa == 0)
12952     return false;
12953
12954   /* Then, as bit 4 is always set, we can mask it off, leaving
12955      the mantissa in the range [0, 15].  */
12956   mantissa &= ~(1 << 4);
12957   gcc_assert (mantissa <= 15);
12958
12959   /* GCC internally does not use IEEE754-like encoding (where normalized
12960      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12961      Our mantissa values are shifted 4 places to the left relative to
12962      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12963      by 5 places to correct for GCC's representation.  */
12964   exponent = 5 - exponent;
12965
12966   return (exponent >= 0 && exponent <= 7);
12967 }
12968
12969 char*
12970 aarch64_output_simd_mov_immediate (rtx const_vector,
12971                                    machine_mode mode,
12972                                    unsigned width)
12973 {
12974   bool is_valid;
12975   static char templ[40];
12976   const char *mnemonic;
12977   const char *shift_op;
12978   unsigned int lane_count = 0;
12979   char element_char;
12980
12981   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12982
12983   /* This will return true to show const_vector is legal for use as either
12984      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12985      also update INFO to show how the immediate should be generated.  */
12986   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12987   gcc_assert (is_valid);
12988
12989   element_char = sizetochar (info.element_width);
12990   lane_count = width / info.element_width;
12991
12992   mode = GET_MODE_INNER (mode);
12993   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12994     {
12995       gcc_assert (info.shift == 0 && ! info.mvn);
12996       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12997          move immediate path.  */
12998       if (aarch64_float_const_zero_rtx_p (info.value))
12999         info.value = GEN_INT (0);
13000       else
13001         {
13002           const unsigned int buf_size = 20;
13003           char float_buf[buf_size] = {'\0'};
13004           real_to_decimal_for_mode (float_buf,
13005                                     CONST_DOUBLE_REAL_VALUE (info.value),
13006                                     buf_size, buf_size, 1, mode);
13007
13008           if (lane_count == 1)
13009             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
13010           else
13011             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
13012                       lane_count, element_char, float_buf);
13013           return templ;
13014         }
13015     }
13016
13017   mnemonic = info.mvn ? "mvni" : "movi";
13018   shift_op = info.msl ? "msl" : "lsl";
13019
13020   gcc_assert (CONST_INT_P (info.value));
13021   if (lane_count == 1)
13022     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
13023               mnemonic, UINTVAL (info.value));
13024   else if (info.shift)
13025     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13026               ", %s %d", mnemonic, lane_count, element_char,
13027               UINTVAL (info.value), shift_op, info.shift);
13028   else
13029     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
13030               mnemonic, lane_count, element_char, UINTVAL (info.value));
13031   return templ;
13032 }
13033
13034 char*
13035 aarch64_output_scalar_simd_mov_immediate (rtx immediate,  machine_mode mode)
13036 {
13037
13038   /* If a floating point number was passed and we desire to use it in an
13039      integer mode do the conversion to integer.  */
13040   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
13041     {
13042       unsigned HOST_WIDE_INT ival;
13043       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
13044           gcc_unreachable ();
13045       immediate = gen_int_mode (ival, mode);
13046     }
13047
13048   machine_mode vmode;
13049   /* use a 64 bit mode for everything except for DI/DF mode, where we use
13050      a 128 bit vector mode.  */
13051   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
13052
13053   gcc_assert (!VECTOR_MODE_P (mode));
13054   vmode = aarch64_simd_container_mode (mode, width);
13055   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
13056   return aarch64_output_simd_mov_immediate (v_op, vmode, width);
13057 }
13058
13059 /* Split operands into moves from op[1] + op[2] into op[0].  */
13060
13061 void
13062 aarch64_split_combinev16qi (rtx operands[3])
13063 {
13064   unsigned int dest = REGNO (operands[0]);
13065   unsigned int src1 = REGNO (operands[1]);
13066   unsigned int src2 = REGNO (operands[2]);
13067   machine_mode halfmode = GET_MODE (operands[1]);
13068   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
13069   rtx destlo, desthi;
13070
13071   gcc_assert (halfmode == V16QImode);
13072
13073   if (src1 == dest && src2 == dest + halfregs)
13074     {
13075       /* No-op move.  Can't split to nothing; emit something.  */
13076       emit_note (NOTE_INSN_DELETED);
13077       return;
13078     }
13079
13080   /* Preserve register attributes for variable tracking.  */
13081   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
13082   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
13083                                GET_MODE_SIZE (halfmode));
13084
13085   /* Special case of reversed high/low parts.  */
13086   if (reg_overlap_mentioned_p (operands[2], destlo)
13087       && reg_overlap_mentioned_p (operands[1], desthi))
13088     {
13089       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13090       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
13091       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
13092     }
13093   else if (!reg_overlap_mentioned_p (operands[2], destlo))
13094     {
13095       /* Try to avoid unnecessary moves if part of the result
13096          is in the right place already.  */
13097       if (src1 != dest)
13098         emit_move_insn (destlo, operands[1]);
13099       if (src2 != dest + halfregs)
13100         emit_move_insn (desthi, operands[2]);
13101     }
13102   else
13103     {
13104       if (src2 != dest + halfregs)
13105         emit_move_insn (desthi, operands[2]);
13106       if (src1 != dest)
13107         emit_move_insn (destlo, operands[1]);
13108     }
13109 }
13110
13111 /* vec_perm support.  */
13112
13113 #define MAX_VECT_LEN 16
13114
13115 struct expand_vec_perm_d
13116 {
13117   rtx target, op0, op1;
13118   unsigned char perm[MAX_VECT_LEN];
13119   machine_mode vmode;
13120   unsigned char nelt;
13121   bool one_vector_p;
13122   bool testing_p;
13123 };
13124
13125 /* Generate a variable permutation.  */
13126
13127 static void
13128 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
13129 {
13130   machine_mode vmode = GET_MODE (target);
13131   bool one_vector_p = rtx_equal_p (op0, op1);
13132
13133   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
13134   gcc_checking_assert (GET_MODE (op0) == vmode);
13135   gcc_checking_assert (GET_MODE (op1) == vmode);
13136   gcc_checking_assert (GET_MODE (sel) == vmode);
13137   gcc_checking_assert (TARGET_SIMD);
13138
13139   if (one_vector_p)
13140     {
13141       if (vmode == V8QImode)
13142         {
13143           /* Expand the argument to a V16QI mode by duplicating it.  */
13144           rtx pair = gen_reg_rtx (V16QImode);
13145           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
13146           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13147         }
13148       else
13149         {
13150           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
13151         }
13152     }
13153   else
13154     {
13155       rtx pair;
13156
13157       if (vmode == V8QImode)
13158         {
13159           pair = gen_reg_rtx (V16QImode);
13160           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
13161           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
13162         }
13163       else
13164         {
13165           pair = gen_reg_rtx (OImode);
13166           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
13167           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
13168         }
13169     }
13170 }
13171
13172 void
13173 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
13174 {
13175   machine_mode vmode = GET_MODE (target);
13176   unsigned int nelt = GET_MODE_NUNITS (vmode);
13177   bool one_vector_p = rtx_equal_p (op0, op1);
13178   rtx mask;
13179
13180   /* The TBL instruction does not use a modulo index, so we must take care
13181      of that ourselves.  */
13182   mask = aarch64_simd_gen_const_vector_dup (vmode,
13183       one_vector_p ? nelt - 1 : 2 * nelt - 1);
13184   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
13185
13186   /* For big-endian, we also need to reverse the index within the vector
13187      (but not which vector).  */
13188   if (BYTES_BIG_ENDIAN)
13189     {
13190       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
13191       if (!one_vector_p)
13192         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
13193       sel = expand_simple_binop (vmode, XOR, sel, mask,
13194                                  NULL, 0, OPTAB_LIB_WIDEN);
13195     }
13196   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
13197 }
13198
13199 /* Recognize patterns suitable for the TRN instructions.  */
13200 static bool
13201 aarch64_evpc_trn (struct expand_vec_perm_d *d)
13202 {
13203   unsigned int i, odd, mask, nelt = d->nelt;
13204   rtx out, in0, in1, x;
13205   rtx (*gen) (rtx, rtx, rtx);
13206   machine_mode vmode = d->vmode;
13207
13208   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13209     return false;
13210
13211   /* Note that these are little-endian tests.
13212      We correct for big-endian later.  */
13213   if (d->perm[0] == 0)
13214     odd = 0;
13215   else if (d->perm[0] == 1)
13216     odd = 1;
13217   else
13218     return false;
13219   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13220
13221   for (i = 0; i < nelt; i += 2)
13222     {
13223       if (d->perm[i] != i + odd)
13224         return false;
13225       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
13226         return false;
13227     }
13228
13229   /* Success!  */
13230   if (d->testing_p)
13231     return true;
13232
13233   in0 = d->op0;
13234   in1 = d->op1;
13235   if (BYTES_BIG_ENDIAN)
13236     {
13237       x = in0, in0 = in1, in1 = x;
13238       odd = !odd;
13239     }
13240   out = d->target;
13241
13242   if (odd)
13243     {
13244       switch (vmode)
13245         {
13246         case E_V16QImode: gen = gen_aarch64_trn2v16qi; break;
13247         case E_V8QImode: gen = gen_aarch64_trn2v8qi; break;
13248         case E_V8HImode: gen = gen_aarch64_trn2v8hi; break;
13249         case E_V4HImode: gen = gen_aarch64_trn2v4hi; break;
13250         case E_V4SImode: gen = gen_aarch64_trn2v4si; break;
13251         case E_V2SImode: gen = gen_aarch64_trn2v2si; break;
13252         case E_V2DImode: gen = gen_aarch64_trn2v2di; break;
13253         case E_V4HFmode: gen = gen_aarch64_trn2v4hf; break;
13254         case E_V8HFmode: gen = gen_aarch64_trn2v8hf; break;
13255         case E_V4SFmode: gen = gen_aarch64_trn2v4sf; break;
13256         case E_V2SFmode: gen = gen_aarch64_trn2v2sf; break;
13257         case E_V2DFmode: gen = gen_aarch64_trn2v2df; break;
13258         default:
13259           return false;
13260         }
13261     }
13262   else
13263     {
13264       switch (vmode)
13265         {
13266         case E_V16QImode: gen = gen_aarch64_trn1v16qi; break;
13267         case E_V8QImode: gen = gen_aarch64_trn1v8qi; break;
13268         case E_V8HImode: gen = gen_aarch64_trn1v8hi; break;
13269         case E_V4HImode: gen = gen_aarch64_trn1v4hi; break;
13270         case E_V4SImode: gen = gen_aarch64_trn1v4si; break;
13271         case E_V2SImode: gen = gen_aarch64_trn1v2si; break;
13272         case E_V2DImode: gen = gen_aarch64_trn1v2di; break;
13273         case E_V4HFmode: gen = gen_aarch64_trn1v4hf; break;
13274         case E_V8HFmode: gen = gen_aarch64_trn1v8hf; break;
13275         case E_V4SFmode: gen = gen_aarch64_trn1v4sf; break;
13276         case E_V2SFmode: gen = gen_aarch64_trn1v2sf; break;
13277         case E_V2DFmode: gen = gen_aarch64_trn1v2df; break;
13278         default:
13279           return false;
13280         }
13281     }
13282
13283   emit_insn (gen (out, in0, in1));
13284   return true;
13285 }
13286
13287 /* Recognize patterns suitable for the UZP instructions.  */
13288 static bool
13289 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
13290 {
13291   unsigned int i, odd, mask, nelt = d->nelt;
13292   rtx out, in0, in1, x;
13293   rtx (*gen) (rtx, rtx, rtx);
13294   machine_mode vmode = d->vmode;
13295
13296   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13297     return false;
13298
13299   /* Note that these are little-endian tests.
13300      We correct for big-endian later.  */
13301   if (d->perm[0] == 0)
13302     odd = 0;
13303   else if (d->perm[0] == 1)
13304     odd = 1;
13305   else
13306     return false;
13307   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13308
13309   for (i = 0; i < nelt; i++)
13310     {
13311       unsigned elt = (i * 2 + odd) & mask;
13312       if (d->perm[i] != elt)
13313         return false;
13314     }
13315
13316   /* Success!  */
13317   if (d->testing_p)
13318     return true;
13319
13320   in0 = d->op0;
13321   in1 = d->op1;
13322   if (BYTES_BIG_ENDIAN)
13323     {
13324       x = in0, in0 = in1, in1 = x;
13325       odd = !odd;
13326     }
13327   out = d->target;
13328
13329   if (odd)
13330     {
13331       switch (vmode)
13332         {
13333         case E_V16QImode: gen = gen_aarch64_uzp2v16qi; break;
13334         case E_V8QImode: gen = gen_aarch64_uzp2v8qi; break;
13335         case E_V8HImode: gen = gen_aarch64_uzp2v8hi; break;
13336         case E_V4HImode: gen = gen_aarch64_uzp2v4hi; break;
13337         case E_V4SImode: gen = gen_aarch64_uzp2v4si; break;
13338         case E_V2SImode: gen = gen_aarch64_uzp2v2si; break;
13339         case E_V2DImode: gen = gen_aarch64_uzp2v2di; break;
13340         case E_V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
13341         case E_V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
13342         case E_V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
13343         case E_V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
13344         case E_V2DFmode: gen = gen_aarch64_uzp2v2df; break;
13345         default:
13346           return false;
13347         }
13348     }
13349   else
13350     {
13351       switch (vmode)
13352         {
13353         case E_V16QImode: gen = gen_aarch64_uzp1v16qi; break;
13354         case E_V8QImode: gen = gen_aarch64_uzp1v8qi; break;
13355         case E_V8HImode: gen = gen_aarch64_uzp1v8hi; break;
13356         case E_V4HImode: gen = gen_aarch64_uzp1v4hi; break;
13357         case E_V4SImode: gen = gen_aarch64_uzp1v4si; break;
13358         case E_V2SImode: gen = gen_aarch64_uzp1v2si; break;
13359         case E_V2DImode: gen = gen_aarch64_uzp1v2di; break;
13360         case E_V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
13361         case E_V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
13362         case E_V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
13363         case E_V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
13364         case E_V2DFmode: gen = gen_aarch64_uzp1v2df; break;
13365         default:
13366           return false;
13367         }
13368     }
13369
13370   emit_insn (gen (out, in0, in1));
13371   return true;
13372 }
13373
13374 /* Recognize patterns suitable for the ZIP instructions.  */
13375 static bool
13376 aarch64_evpc_zip (struct expand_vec_perm_d *d)
13377 {
13378   unsigned int i, high, mask, nelt = d->nelt;
13379   rtx out, in0, in1, x;
13380   rtx (*gen) (rtx, rtx, rtx);
13381   machine_mode vmode = d->vmode;
13382
13383   if (GET_MODE_UNIT_SIZE (vmode) > 8)
13384     return false;
13385
13386   /* Note that these are little-endian tests.
13387      We correct for big-endian later.  */
13388   high = nelt / 2;
13389   if (d->perm[0] == high)
13390     /* Do Nothing.  */
13391     ;
13392   else if (d->perm[0] == 0)
13393     high = 0;
13394   else
13395     return false;
13396   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13397
13398   for (i = 0; i < nelt / 2; i++)
13399     {
13400       unsigned elt = (i + high) & mask;
13401       if (d->perm[i * 2] != elt)
13402         return false;
13403       elt = (elt + nelt) & mask;
13404       if (d->perm[i * 2 + 1] != elt)
13405         return false;
13406     }
13407
13408   /* Success!  */
13409   if (d->testing_p)
13410     return true;
13411
13412   in0 = d->op0;
13413   in1 = d->op1;
13414   if (BYTES_BIG_ENDIAN)
13415     {
13416       x = in0, in0 = in1, in1 = x;
13417       high = !high;
13418     }
13419   out = d->target;
13420
13421   if (high)
13422     {
13423       switch (vmode)
13424         {
13425         case E_V16QImode: gen = gen_aarch64_zip2v16qi; break;
13426         case E_V8QImode: gen = gen_aarch64_zip2v8qi; break;
13427         case E_V8HImode: gen = gen_aarch64_zip2v8hi; break;
13428         case E_V4HImode: gen = gen_aarch64_zip2v4hi; break;
13429         case E_V4SImode: gen = gen_aarch64_zip2v4si; break;
13430         case E_V2SImode: gen = gen_aarch64_zip2v2si; break;
13431         case E_V2DImode: gen = gen_aarch64_zip2v2di; break;
13432         case E_V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13433         case E_V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13434         case E_V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13435         case E_V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13436         case E_V2DFmode: gen = gen_aarch64_zip2v2df; break;
13437         default:
13438           return false;
13439         }
13440     }
13441   else
13442     {
13443       switch (vmode)
13444         {
13445         case E_V16QImode: gen = gen_aarch64_zip1v16qi; break;
13446         case E_V8QImode: gen = gen_aarch64_zip1v8qi; break;
13447         case E_V8HImode: gen = gen_aarch64_zip1v8hi; break;
13448         case E_V4HImode: gen = gen_aarch64_zip1v4hi; break;
13449         case E_V4SImode: gen = gen_aarch64_zip1v4si; break;
13450         case E_V2SImode: gen = gen_aarch64_zip1v2si; break;
13451         case E_V2DImode: gen = gen_aarch64_zip1v2di; break;
13452         case E_V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13453         case E_V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13454         case E_V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13455         case E_V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13456         case E_V2DFmode: gen = gen_aarch64_zip1v2df; break;
13457         default:
13458           return false;
13459         }
13460     }
13461
13462   emit_insn (gen (out, in0, in1));
13463   return true;
13464 }
13465
13466 /* Recognize patterns for the EXT insn.  */
13467
13468 static bool
13469 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13470 {
13471   unsigned int i, nelt = d->nelt;
13472   rtx (*gen) (rtx, rtx, rtx, rtx);
13473   rtx offset;
13474
13475   unsigned int location = d->perm[0]; /* Always < nelt.  */
13476
13477   /* Check if the extracted indices are increasing by one.  */
13478   for (i = 1; i < nelt; i++)
13479     {
13480       unsigned int required = location + i;
13481       if (d->one_vector_p)
13482         {
13483           /* We'll pass the same vector in twice, so allow indices to wrap.  */
13484           required &= (nelt - 1);
13485         }
13486       if (d->perm[i] != required)
13487         return false;
13488     }
13489
13490   switch (d->vmode)
13491     {
13492     case E_V16QImode: gen = gen_aarch64_extv16qi; break;
13493     case E_V8QImode: gen = gen_aarch64_extv8qi; break;
13494     case E_V4HImode: gen = gen_aarch64_extv4hi; break;
13495     case E_V8HImode: gen = gen_aarch64_extv8hi; break;
13496     case E_V2SImode: gen = gen_aarch64_extv2si; break;
13497     case E_V4SImode: gen = gen_aarch64_extv4si; break;
13498     case E_V4HFmode: gen = gen_aarch64_extv4hf; break;
13499     case E_V8HFmode: gen = gen_aarch64_extv8hf; break;
13500     case E_V2SFmode: gen = gen_aarch64_extv2sf; break;
13501     case E_V4SFmode: gen = gen_aarch64_extv4sf; break;
13502     case E_V2DImode: gen = gen_aarch64_extv2di; break;
13503     case E_V2DFmode: gen = gen_aarch64_extv2df; break;
13504     default:
13505       return false;
13506     }
13507
13508   /* Success! */
13509   if (d->testing_p)
13510     return true;
13511
13512   /* The case where (location == 0) is a no-op for both big- and little-endian,
13513      and is removed by the mid-end at optimization levels -O1 and higher.  */
13514
13515   if (BYTES_BIG_ENDIAN && (location != 0))
13516     {
13517       /* After setup, we want the high elements of the first vector (stored
13518          at the LSB end of the register), and the low elements of the second
13519          vector (stored at the MSB end of the register). So swap.  */
13520       std::swap (d->op0, d->op1);
13521       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
13522       location = nelt - location;
13523     }
13524
13525   offset = GEN_INT (location);
13526   emit_insn (gen (d->target, d->op0, d->op1, offset));
13527   return true;
13528 }
13529
13530 /* Recognize patterns for the REV insns.  */
13531
13532 static bool
13533 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13534 {
13535   unsigned int i, j, diff, nelt = d->nelt;
13536   rtx (*gen) (rtx, rtx);
13537
13538   if (!d->one_vector_p)
13539     return false;
13540
13541   diff = d->perm[0];
13542   switch (diff)
13543     {
13544     case 7:
13545       switch (d->vmode)
13546         {
13547         case E_V16QImode: gen = gen_aarch64_rev64v16qi; break;
13548         case E_V8QImode: gen = gen_aarch64_rev64v8qi;  break;
13549         default:
13550           return false;
13551         }
13552       break;
13553     case 3:
13554       switch (d->vmode)
13555         {
13556         case E_V16QImode: gen = gen_aarch64_rev32v16qi; break;
13557         case E_V8QImode: gen = gen_aarch64_rev32v8qi;  break;
13558         case E_V8HImode: gen = gen_aarch64_rev64v8hi;  break;
13559         case E_V4HImode: gen = gen_aarch64_rev64v4hi;  break;
13560         default:
13561           return false;
13562         }
13563       break;
13564     case 1:
13565       switch (d->vmode)
13566         {
13567         case E_V16QImode: gen = gen_aarch64_rev16v16qi; break;
13568         case E_V8QImode: gen = gen_aarch64_rev16v8qi;  break;
13569         case E_V8HImode: gen = gen_aarch64_rev32v8hi;  break;
13570         case E_V4HImode: gen = gen_aarch64_rev32v4hi;  break;
13571         case E_V4SImode: gen = gen_aarch64_rev64v4si;  break;
13572         case E_V2SImode: gen = gen_aarch64_rev64v2si;  break;
13573         case E_V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
13574         case E_V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
13575         case E_V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
13576         case E_V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
13577         default:
13578           return false;
13579         }
13580       break;
13581     default:
13582       return false;
13583     }
13584
13585   for (i = 0; i < nelt ; i += diff + 1)
13586     for (j = 0; j <= diff; j += 1)
13587       {
13588         /* This is guaranteed to be true as the value of diff
13589            is 7, 3, 1 and we should have enough elements in the
13590            queue to generate this.  Getting a vector mask with a
13591            value of diff other than these values implies that
13592            something is wrong by the time we get here.  */
13593         gcc_assert (i + j < nelt);
13594         if (d->perm[i + j] != i + diff - j)
13595           return false;
13596       }
13597
13598   /* Success! */
13599   if (d->testing_p)
13600     return true;
13601
13602   emit_insn (gen (d->target, d->op0));
13603   return true;
13604 }
13605
13606 static bool
13607 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13608 {
13609   rtx (*gen) (rtx, rtx, rtx);
13610   rtx out = d->target;
13611   rtx in0;
13612   machine_mode vmode = d->vmode;
13613   unsigned int i, elt, nelt = d->nelt;
13614   rtx lane;
13615
13616   elt = d->perm[0];
13617   for (i = 1; i < nelt; i++)
13618     {
13619       if (elt != d->perm[i])
13620         return false;
13621     }
13622
13623   /* The generic preparation in aarch64_expand_vec_perm_const_1
13624      swaps the operand order and the permute indices if it finds
13625      d->perm[0] to be in the second operand.  Thus, we can always
13626      use d->op0 and need not do any extra arithmetic to get the
13627      correct lane number.  */
13628   in0 = d->op0;
13629   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
13630
13631   switch (vmode)
13632     {
13633     case E_V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13634     case E_V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13635     case E_V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13636     case E_V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13637     case E_V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13638     case E_V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13639     case E_V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13640     case E_V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13641     case E_V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13642     case E_V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13643     case E_V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13644     case E_V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13645     default:
13646       return false;
13647     }
13648
13649   emit_insn (gen (out, in0, lane));
13650   return true;
13651 }
13652
13653 static bool
13654 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13655 {
13656   rtx rperm[MAX_VECT_LEN], sel;
13657   machine_mode vmode = d->vmode;
13658   unsigned int i, nelt = d->nelt;
13659
13660   if (d->testing_p)
13661     return true;
13662
13663   /* Generic code will try constant permutation twice.  Once with the
13664      original mode and again with the elements lowered to QImode.
13665      So wait and don't do the selector expansion ourselves.  */
13666   if (vmode != V8QImode && vmode != V16QImode)
13667     return false;
13668
13669   for (i = 0; i < nelt; ++i)
13670     {
13671       int nunits = GET_MODE_NUNITS (vmode);
13672
13673       /* If big-endian and two vectors we end up with a weird mixed-endian
13674          mode on NEON.  Reverse the index within each word but not the word
13675          itself.  */
13676       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13677                                            : d->perm[i]);
13678     }
13679   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13680   sel = force_reg (vmode, sel);
13681
13682   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13683   return true;
13684 }
13685
13686 static bool
13687 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13688 {
13689   /* The pattern matching functions above are written to look for a small
13690      number to begin the sequence (0, 1, N/2).  If we begin with an index
13691      from the second operand, we can swap the operands.  */
13692   if (d->perm[0] >= d->nelt)
13693     {
13694       unsigned i, nelt = d->nelt;
13695
13696       gcc_assert (nelt == (nelt & -nelt));
13697       for (i = 0; i < nelt; ++i)
13698         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
13699
13700       std::swap (d->op0, d->op1);
13701     }
13702
13703   if (TARGET_SIMD)
13704     {
13705       if (aarch64_evpc_rev (d))
13706         return true;
13707       else if (aarch64_evpc_ext (d))
13708         return true;
13709       else if (aarch64_evpc_dup (d))
13710         return true;
13711       else if (aarch64_evpc_zip (d))
13712         return true;
13713       else if (aarch64_evpc_uzp (d))
13714         return true;
13715       else if (aarch64_evpc_trn (d))
13716         return true;
13717       return aarch64_evpc_tbl (d);
13718     }
13719   return false;
13720 }
13721
13722 /* Expand a vec_perm_const pattern.  */
13723
13724 bool
13725 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13726 {
13727   struct expand_vec_perm_d d;
13728   int i, nelt, which;
13729
13730   d.target = target;
13731   d.op0 = op0;
13732   d.op1 = op1;
13733
13734   d.vmode = GET_MODE (target);
13735   gcc_assert (VECTOR_MODE_P (d.vmode));
13736   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13737   d.testing_p = false;
13738
13739   for (i = which = 0; i < nelt; ++i)
13740     {
13741       rtx e = XVECEXP (sel, 0, i);
13742       int ei = INTVAL (e) & (2 * nelt - 1);
13743       which |= (ei < nelt ? 1 : 2);
13744       d.perm[i] = ei;
13745     }
13746
13747   switch (which)
13748     {
13749     default:
13750       gcc_unreachable ();
13751
13752     case 3:
13753       d.one_vector_p = false;
13754       if (!rtx_equal_p (op0, op1))
13755         break;
13756
13757       /* The elements of PERM do not suggest that only the first operand
13758          is used, but both operands are identical.  Allow easier matching
13759          of the permutation by folding the permutation into the single
13760          input vector.  */
13761       /* Fall Through.  */
13762     case 2:
13763       for (i = 0; i < nelt; ++i)
13764         d.perm[i] &= nelt - 1;
13765       d.op0 = op1;
13766       d.one_vector_p = true;
13767       break;
13768
13769     case 1:
13770       d.op1 = op0;
13771       d.one_vector_p = true;
13772       break;
13773     }
13774
13775   return aarch64_expand_vec_perm_const_1 (&d);
13776 }
13777
13778 static bool
13779 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13780                                      const unsigned char *sel)
13781 {
13782   struct expand_vec_perm_d d;
13783   unsigned int i, nelt, which;
13784   bool ret;
13785
13786   d.vmode = vmode;
13787   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13788   d.testing_p = true;
13789   memcpy (d.perm, sel, nelt);
13790
13791   /* Calculate whether all elements are in one vector.  */
13792   for (i = which = 0; i < nelt; ++i)
13793     {
13794       unsigned char e = d.perm[i];
13795       gcc_assert (e < 2 * nelt);
13796       which |= (e < nelt ? 1 : 2);
13797     }
13798
13799   /* If all elements are from the second vector, reindex as if from the
13800      first vector.  */
13801   if (which == 2)
13802     for (i = 0; i < nelt; ++i)
13803       d.perm[i] -= nelt;
13804
13805   /* Check whether the mask can be applied to a single vector.  */
13806   d.one_vector_p = (which != 3);
13807
13808   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13809   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13810   if (!d.one_vector_p)
13811     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13812
13813   start_sequence ();
13814   ret = aarch64_expand_vec_perm_const_1 (&d);
13815   end_sequence ();
13816
13817   return ret;
13818 }
13819
13820 rtx
13821 aarch64_reverse_mask (machine_mode mode)
13822 {
13823   /* We have to reverse each vector because we dont have
13824      a permuted load that can reverse-load according to ABI rules.  */
13825   rtx mask;
13826   rtvec v = rtvec_alloc (16);
13827   int i, j;
13828   int nunits = GET_MODE_NUNITS (mode);
13829   int usize = GET_MODE_UNIT_SIZE (mode);
13830
13831   gcc_assert (BYTES_BIG_ENDIAN);
13832   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13833
13834   for (i = 0; i < nunits; i++)
13835     for (j = 0; j < usize; j++)
13836       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13837   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13838   return force_reg (V16QImode, mask);
13839 }
13840
13841 /* Implement MODES_TIEABLE_P.  In principle we should always return true.
13842    However due to issues with register allocation it is preferable to avoid
13843    tieing integer scalar and FP scalar modes.  Executing integer operations
13844    in general registers is better than treating them as scalar vector
13845    operations.  This reduces latency and avoids redundant int<->FP moves.
13846    So tie modes if they are either the same class, or vector modes with
13847    other vector modes, vector structs or any scalar mode.
13848 */
13849
13850 bool
13851 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13852 {
13853   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13854     return true;
13855
13856   /* We specifically want to allow elements of "structure" modes to
13857      be tieable to the structure.  This more general condition allows
13858      other rarer situations too.  */
13859   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13860     return true;
13861
13862   /* Also allow any scalar modes with vectors.  */
13863   if (aarch64_vector_mode_supported_p (mode1)
13864       || aarch64_vector_mode_supported_p (mode2))
13865     return true;
13866
13867   return false;
13868 }
13869
13870 /* Return a new RTX holding the result of moving POINTER forward by
13871    AMOUNT bytes.  */
13872
13873 static rtx
13874 aarch64_move_pointer (rtx pointer, int amount)
13875 {
13876   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13877
13878   return adjust_automodify_address (pointer, GET_MODE (pointer),
13879                                     next, amount);
13880 }
13881
13882 /* Return a new RTX holding the result of moving POINTER forward by the
13883    size of the mode it points to.  */
13884
13885 static rtx
13886 aarch64_progress_pointer (rtx pointer)
13887 {
13888   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13889
13890   return aarch64_move_pointer (pointer, amount);
13891 }
13892
13893 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13894    MODE bytes.  */
13895
13896 static void
13897 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13898                                               machine_mode mode)
13899 {
13900   rtx reg = gen_reg_rtx (mode);
13901
13902   /* "Cast" the pointers to the correct mode.  */
13903   *src = adjust_address (*src, mode, 0);
13904   *dst = adjust_address (*dst, mode, 0);
13905   /* Emit the memcpy.  */
13906   emit_move_insn (reg, *src);
13907   emit_move_insn (*dst, reg);
13908   /* Move the pointers forward.  */
13909   *src = aarch64_progress_pointer (*src);
13910   *dst = aarch64_progress_pointer (*dst);
13911 }
13912
13913 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13914    we succeed, otherwise return false.  */
13915
13916 bool
13917 aarch64_expand_movmem (rtx *operands)
13918 {
13919   unsigned int n;
13920   rtx dst = operands[0];
13921   rtx src = operands[1];
13922   rtx base;
13923   bool speed_p = !optimize_function_for_size_p (cfun);
13924
13925   /* When optimizing for size, give a better estimate of the length of a
13926      memcpy call, but use the default otherwise.  */
13927   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13928
13929   /* We can't do anything smart if the amount to copy is not constant.  */
13930   if (!CONST_INT_P (operands[2]))
13931     return false;
13932
13933   n = UINTVAL (operands[2]);
13934
13935   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13936      need to make at most two moves.  For cases above 16 bytes it will be one
13937      move for each 16 byte chunk, then at most two additional moves.  */
13938   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13939     return false;
13940
13941   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13942   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13943
13944   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13945   src = adjust_automodify_address (src, VOIDmode, base, 0);
13946
13947   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13948      1-byte chunk.  */
13949   if (n < 4)
13950     {
13951       if (n >= 2)
13952         {
13953           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13954           n -= 2;
13955         }
13956
13957       if (n == 1)
13958         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13959
13960       return true;
13961     }
13962
13963   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13964      4-byte chunk, partially overlapping with the previously copied chunk.  */
13965   if (n < 8)
13966     {
13967       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13968       n -= 4;
13969       if (n > 0)
13970         {
13971           int move = n - 4;
13972
13973           src = aarch64_move_pointer (src, move);
13974           dst = aarch64_move_pointer (dst, move);
13975           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13976         }
13977       return true;
13978     }
13979
13980   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
13981      them, then (if applicable) an 8-byte chunk.  */
13982   while (n >= 8)
13983     {
13984       if (n / 16)
13985         {
13986           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13987           n -= 16;
13988         }
13989       else
13990         {
13991           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13992           n -= 8;
13993         }
13994     }
13995
13996   /* Finish the final bytes of the copy.  We can always do this in one
13997      instruction.  We either copy the exact amount we need, or partially
13998      overlap with the previous chunk we copied and copy 8-bytes.  */
13999   if (n == 0)
14000     return true;
14001   else if (n == 1)
14002     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
14003   else if (n == 2)
14004     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
14005   else if (n == 4)
14006     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14007   else
14008     {
14009       if (n == 3)
14010         {
14011           src = aarch64_move_pointer (src, -1);
14012           dst = aarch64_move_pointer (dst, -1);
14013           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
14014         }
14015       else
14016         {
14017           int move = n - 8;
14018
14019           src = aarch64_move_pointer (src, move);
14020           dst = aarch64_move_pointer (dst, move);
14021           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
14022         }
14023     }
14024
14025   return true;
14026 }
14027
14028 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14029    SImode stores.  Handle the case when the constant has identical
14030    bottom and top halves.  This is beneficial when the two stores can be
14031    merged into an STP and we avoid synthesising potentially expensive
14032    immediates twice.  Return true if such a split is possible.  */
14033
14034 bool
14035 aarch64_split_dimode_const_store (rtx dst, rtx src)
14036 {
14037   rtx lo = gen_lowpart (SImode, src);
14038   rtx hi = gen_highpart_mode (SImode, DImode, src);
14039
14040   bool size_p = optimize_function_for_size_p (cfun);
14041
14042   if (!rtx_equal_p (lo, hi))
14043     return false;
14044
14045   unsigned int orig_cost
14046     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
14047   unsigned int lo_cost
14048     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
14049
14050   /* We want to transform:
14051      MOV        x1, 49370
14052      MOVK       x1, 0x140, lsl 16
14053      MOVK       x1, 0xc0da, lsl 32
14054      MOVK       x1, 0x140, lsl 48
14055      STR        x1, [x0]
14056    into:
14057      MOV        w1, 49370
14058      MOVK       w1, 0x140, lsl 16
14059      STP        w1, w1, [x0]
14060    So we want to perform this only when we save two instructions
14061    or more.  When optimizing for size, however, accept any code size
14062    savings we can.  */
14063   if (size_p && orig_cost <= lo_cost)
14064     return false;
14065
14066   if (!size_p
14067       && (orig_cost <= lo_cost + 1))
14068     return false;
14069
14070   rtx mem_lo = adjust_address (dst, SImode, 0);
14071   if (!aarch64_mem_pair_operand (mem_lo, SImode))
14072     return false;
14073
14074   rtx tmp_reg = gen_reg_rtx (SImode);
14075   aarch64_expand_mov_immediate (tmp_reg, lo);
14076   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
14077   /* Don't emit an explicit store pair as this may not be always profitable.
14078      Let the sched-fusion logic decide whether to merge them.  */
14079   emit_move_insn (mem_lo, tmp_reg);
14080   emit_move_insn (mem_hi, tmp_reg);
14081
14082   return true;
14083 }
14084
14085 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
14086
14087 static unsigned HOST_WIDE_INT
14088 aarch64_asan_shadow_offset (void)
14089 {
14090   return (HOST_WIDE_INT_1 << 36);
14091 }
14092
14093 static bool
14094 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
14095                                         unsigned int align,
14096                                         enum by_pieces_operation op,
14097                                         bool speed_p)
14098 {
14099   /* STORE_BY_PIECES can be used when copying a constant string, but
14100      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14101      For now we always fail this and let the move_by_pieces code copy
14102      the string from read-only memory.  */
14103   if (op == STORE_BY_PIECES)
14104     return false;
14105
14106   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
14107 }
14108
14109 static rtx
14110 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
14111                         int code, tree treeop0, tree treeop1)
14112 {
14113   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14114   rtx op0, op1;
14115   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14116   insn_code icode;
14117   struct expand_operand ops[4];
14118
14119   start_sequence ();
14120   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14121
14122   op_mode = GET_MODE (op0);
14123   if (op_mode == VOIDmode)
14124     op_mode = GET_MODE (op1);
14125
14126   switch (op_mode)
14127     {
14128     case E_QImode:
14129     case E_HImode:
14130     case E_SImode:
14131       cmp_mode = SImode;
14132       icode = CODE_FOR_cmpsi;
14133       break;
14134
14135     case E_DImode:
14136       cmp_mode = DImode;
14137       icode = CODE_FOR_cmpdi;
14138       break;
14139
14140     case E_SFmode:
14141       cmp_mode = SFmode;
14142       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14143       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
14144       break;
14145
14146     case E_DFmode:
14147       cmp_mode = DFmode;
14148       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
14149       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
14150       break;
14151
14152     default:
14153       end_sequence ();
14154       return NULL_RTX;
14155     }
14156
14157   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
14158   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
14159   if (!op0 || !op1)
14160     {
14161       end_sequence ();
14162       return NULL_RTX;
14163     }
14164   *prep_seq = get_insns ();
14165   end_sequence ();
14166
14167   create_fixed_operand (&ops[0], op0);
14168   create_fixed_operand (&ops[1], op1);
14169
14170   start_sequence ();
14171   if (!maybe_expand_insn (icode, 2, ops))
14172     {
14173       end_sequence ();
14174       return NULL_RTX;
14175     }
14176   *gen_seq = get_insns ();
14177   end_sequence ();
14178
14179   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
14180                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
14181 }
14182
14183 static rtx
14184 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
14185                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
14186 {
14187   rtx op0, op1, target;
14188   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
14189   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
14190   insn_code icode;
14191   struct expand_operand ops[6];
14192   int aarch64_cond;
14193
14194   push_to_sequence (*prep_seq);
14195   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
14196
14197   op_mode = GET_MODE (op0);
14198   if (op_mode == VOIDmode)
14199     op_mode = GET_MODE (op1);
14200
14201   switch (op_mode)
14202     {
14203     case E_QImode:
14204     case E_HImode:
14205     case E_SImode:
14206       cmp_mode = SImode;
14207       icode = CODE_FOR_ccmpsi;
14208       break;
14209
14210     case E_DImode:
14211       cmp_mode = DImode;
14212       icode = CODE_FOR_ccmpdi;
14213       break;
14214
14215     case E_SFmode:
14216       cmp_mode = SFmode;
14217       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14218       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
14219       break;
14220
14221     case E_DFmode:
14222       cmp_mode = DFmode;
14223       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
14224       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
14225       break;
14226
14227     default:
14228       end_sequence ();
14229       return NULL_RTX;
14230     }
14231
14232   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
14233   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
14234   if (!op0 || !op1)
14235     {
14236       end_sequence ();
14237       return NULL_RTX;
14238     }
14239   *prep_seq = get_insns ();
14240   end_sequence ();
14241
14242   target = gen_rtx_REG (cc_mode, CC_REGNUM);
14243   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
14244
14245   if (bit_code != AND)
14246     {
14247       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
14248                                                 GET_MODE (XEXP (prev, 0))),
14249                              VOIDmode, XEXP (prev, 0), const0_rtx);
14250       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
14251     }
14252
14253   create_fixed_operand (&ops[0], XEXP (prev, 0));
14254   create_fixed_operand (&ops[1], target);
14255   create_fixed_operand (&ops[2], op0);
14256   create_fixed_operand (&ops[3], op1);
14257   create_fixed_operand (&ops[4], prev);
14258   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
14259
14260   push_to_sequence (*gen_seq);
14261   if (!maybe_expand_insn (icode, 6, ops))
14262     {
14263       end_sequence ();
14264       return NULL_RTX;
14265     }
14266
14267   *gen_seq = get_insns ();
14268   end_sequence ();
14269
14270   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
14271 }
14272
14273 #undef TARGET_GEN_CCMP_FIRST
14274 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14275
14276 #undef TARGET_GEN_CCMP_NEXT
14277 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14278
14279 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
14280    instruction fusion of some sort.  */
14281
14282 static bool
14283 aarch64_macro_fusion_p (void)
14284 {
14285   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
14286 }
14287
14288
14289 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
14290    should be kept together during scheduling.  */
14291
14292 static bool
14293 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
14294 {
14295   rtx set_dest;
14296   rtx prev_set = single_set (prev);
14297   rtx curr_set = single_set (curr);
14298   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
14299   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
14300
14301   if (!aarch64_macro_fusion_p ())
14302     return false;
14303
14304   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
14305     {
14306       /* We are trying to match:
14307          prev (mov)  == (set (reg r0) (const_int imm16))
14308          curr (movk) == (set (zero_extract (reg r0)
14309                                            (const_int 16)
14310                                            (const_int 16))
14311                              (const_int imm16_1))  */
14312
14313       set_dest = SET_DEST (curr_set);
14314
14315       if (GET_CODE (set_dest) == ZERO_EXTRACT
14316           && CONST_INT_P (SET_SRC (curr_set))
14317           && CONST_INT_P (SET_SRC (prev_set))
14318           && CONST_INT_P (XEXP (set_dest, 2))
14319           && INTVAL (XEXP (set_dest, 2)) == 16
14320           && REG_P (XEXP (set_dest, 0))
14321           && REG_P (SET_DEST (prev_set))
14322           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
14323         {
14324           return true;
14325         }
14326     }
14327
14328   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
14329     {
14330
14331       /*  We're trying to match:
14332           prev (adrp) == (set (reg r1)
14333                               (high (symbol_ref ("SYM"))))
14334           curr (add) == (set (reg r0)
14335                              (lo_sum (reg r1)
14336                                      (symbol_ref ("SYM"))))
14337           Note that r0 need not necessarily be the same as r1, especially
14338           during pre-regalloc scheduling.  */
14339
14340       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14341           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14342         {
14343           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
14344               && REG_P (XEXP (SET_SRC (curr_set), 0))
14345               && REGNO (XEXP (SET_SRC (curr_set), 0))
14346                  == REGNO (SET_DEST (prev_set))
14347               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
14348                               XEXP (SET_SRC (curr_set), 1)))
14349             return true;
14350         }
14351     }
14352
14353   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
14354     {
14355
14356       /* We're trying to match:
14357          prev (movk) == (set (zero_extract (reg r0)
14358                                            (const_int 16)
14359                                            (const_int 32))
14360                              (const_int imm16_1))
14361          curr (movk) == (set (zero_extract (reg r0)
14362                                            (const_int 16)
14363                                            (const_int 48))
14364                              (const_int imm16_2))  */
14365
14366       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
14367           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
14368           && REG_P (XEXP (SET_DEST (prev_set), 0))
14369           && REG_P (XEXP (SET_DEST (curr_set), 0))
14370           && REGNO (XEXP (SET_DEST (prev_set), 0))
14371              == REGNO (XEXP (SET_DEST (curr_set), 0))
14372           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
14373           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
14374           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
14375           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
14376           && CONST_INT_P (SET_SRC (prev_set))
14377           && CONST_INT_P (SET_SRC (curr_set)))
14378         return true;
14379
14380     }
14381   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
14382     {
14383       /* We're trying to match:
14384           prev (adrp) == (set (reg r0)
14385                               (high (symbol_ref ("SYM"))))
14386           curr (ldr) == (set (reg r1)
14387                              (mem (lo_sum (reg r0)
14388                                              (symbol_ref ("SYM")))))
14389                  or
14390           curr (ldr) == (set (reg r1)
14391                              (zero_extend (mem
14392                                            (lo_sum (reg r0)
14393                                                    (symbol_ref ("SYM"))))))  */
14394       if (satisfies_constraint_Ush (SET_SRC (prev_set))
14395           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14396         {
14397           rtx curr_src = SET_SRC (curr_set);
14398
14399           if (GET_CODE (curr_src) == ZERO_EXTEND)
14400             curr_src = XEXP (curr_src, 0);
14401
14402           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14403               && REG_P (XEXP (XEXP (curr_src, 0), 0))
14404               && REGNO (XEXP (XEXP (curr_src, 0), 0))
14405                  == REGNO (SET_DEST (prev_set))
14406               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14407                               XEXP (SET_SRC (prev_set), 0)))
14408               return true;
14409         }
14410     }
14411
14412   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14413        && aarch_crypto_can_dual_issue (prev, curr))
14414     return true;
14415
14416   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14417       && any_condjump_p (curr))
14418     {
14419       enum attr_type prev_type = get_attr_type (prev);
14420
14421       unsigned int condreg1, condreg2;
14422       rtx cc_reg_1;
14423       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
14424       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
14425
14426       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
14427           && prev
14428           && modified_in_p (cc_reg_1, prev))
14429         {
14430           /* FIXME: this misses some which is considered simple arthematic
14431              instructions for ThunderX.  Simple shifts are missed here.  */
14432           if (prev_type == TYPE_ALUS_SREG
14433               || prev_type == TYPE_ALUS_IMM
14434               || prev_type == TYPE_LOGICS_REG
14435               || prev_type == TYPE_LOGICS_IMM)
14436             return true;
14437         }
14438     }
14439
14440   if (prev_set
14441       && curr_set
14442       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
14443       && any_condjump_p (curr))
14444     {
14445       /* We're trying to match:
14446           prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14447           curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
14448                                                          (const_int 0))
14449                                                  (label_ref ("SYM"))
14450                                                  (pc))  */
14451       if (SET_DEST (curr_set) == (pc_rtx)
14452           && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
14453           && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
14454           && REG_P (SET_DEST (prev_set))
14455           && REGNO (SET_DEST (prev_set))
14456              == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
14457         {
14458           /* Fuse ALU operations followed by conditional branch instruction.  */
14459           switch (get_attr_type (prev))
14460             {
14461             case TYPE_ALU_IMM:
14462             case TYPE_ALU_SREG:
14463             case TYPE_ADC_REG:
14464             case TYPE_ADC_IMM:
14465             case TYPE_ADCS_REG:
14466             case TYPE_ADCS_IMM:
14467             case TYPE_LOGIC_REG:
14468             case TYPE_LOGIC_IMM:
14469             case TYPE_CSEL:
14470             case TYPE_ADR:
14471             case TYPE_MOV_IMM:
14472             case TYPE_SHIFT_REG:
14473             case TYPE_SHIFT_IMM:
14474             case TYPE_BFM:
14475             case TYPE_RBIT:
14476             case TYPE_REV:
14477             case TYPE_EXTEND:
14478               return true;
14479
14480             default:;
14481             }
14482         }
14483     }
14484
14485   return false;
14486 }
14487
14488 /* Return true iff the instruction fusion described by OP is enabled.  */
14489
14490 bool
14491 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14492 {
14493   return (aarch64_tune_params.fusible_ops & op) != 0;
14494 }
14495
14496 /* If MEM is in the form of [base+offset], extract the two parts
14497    of address and set to BASE and OFFSET, otherwise return false
14498    after clearing BASE and OFFSET.  */
14499
14500 bool
14501 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14502 {
14503   rtx addr;
14504
14505   gcc_assert (MEM_P (mem));
14506
14507   addr = XEXP (mem, 0);
14508
14509   if (REG_P (addr))
14510     {
14511       *base = addr;
14512       *offset = const0_rtx;
14513       return true;
14514     }
14515
14516   if (GET_CODE (addr) == PLUS
14517       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14518     {
14519       *base = XEXP (addr, 0);
14520       *offset = XEXP (addr, 1);
14521       return true;
14522     }
14523
14524   *base = NULL_RTX;
14525   *offset = NULL_RTX;
14526
14527   return false;
14528 }
14529
14530 /* Types for scheduling fusion.  */
14531 enum sched_fusion_type
14532 {
14533   SCHED_FUSION_NONE = 0,
14534   SCHED_FUSION_LD_SIGN_EXTEND,
14535   SCHED_FUSION_LD_ZERO_EXTEND,
14536   SCHED_FUSION_LD,
14537   SCHED_FUSION_ST,
14538   SCHED_FUSION_NUM
14539 };
14540
14541 /* If INSN is a load or store of address in the form of [base+offset],
14542    extract the two parts and set to BASE and OFFSET.  Return scheduling
14543    fusion type this INSN is.  */
14544
14545 static enum sched_fusion_type
14546 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14547 {
14548   rtx x, dest, src;
14549   enum sched_fusion_type fusion = SCHED_FUSION_LD;
14550
14551   gcc_assert (INSN_P (insn));
14552   x = PATTERN (insn);
14553   if (GET_CODE (x) != SET)
14554     return SCHED_FUSION_NONE;
14555
14556   src = SET_SRC (x);
14557   dest = SET_DEST (x);
14558
14559   machine_mode dest_mode = GET_MODE (dest);
14560
14561   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14562     return SCHED_FUSION_NONE;
14563
14564   if (GET_CODE (src) == SIGN_EXTEND)
14565     {
14566       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14567       src = XEXP (src, 0);
14568       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14569         return SCHED_FUSION_NONE;
14570     }
14571   else if (GET_CODE (src) == ZERO_EXTEND)
14572     {
14573       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14574       src = XEXP (src, 0);
14575       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14576         return SCHED_FUSION_NONE;
14577     }
14578
14579   if (GET_CODE (src) == MEM && REG_P (dest))
14580     extract_base_offset_in_addr (src, base, offset);
14581   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14582     {
14583       fusion = SCHED_FUSION_ST;
14584       extract_base_offset_in_addr (dest, base, offset);
14585     }
14586   else
14587     return SCHED_FUSION_NONE;
14588
14589   if (*base == NULL_RTX || *offset == NULL_RTX)
14590     fusion = SCHED_FUSION_NONE;
14591
14592   return fusion;
14593 }
14594
14595 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14596
14597    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14598    and PRI are only calculated for these instructions.  For other instruction,
14599    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
14600    type instruction fusion can be added by returning different priorities.
14601
14602    It's important that irrelevant instructions get the largest FUSION_PRI.  */
14603
14604 static void
14605 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14606                                int *fusion_pri, int *pri)
14607 {
14608   int tmp, off_val;
14609   rtx base, offset;
14610   enum sched_fusion_type fusion;
14611
14612   gcc_assert (INSN_P (insn));
14613
14614   tmp = max_pri - 1;
14615   fusion = fusion_load_store (insn, &base, &offset);
14616   if (fusion == SCHED_FUSION_NONE)
14617     {
14618       *pri = tmp;
14619       *fusion_pri = tmp;
14620       return;
14621     }
14622
14623   /* Set FUSION_PRI according to fusion type and base register.  */
14624   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14625
14626   /* Calculate PRI.  */
14627   tmp /= 2;
14628
14629   /* INSN with smaller offset goes first.  */
14630   off_val = (int)(INTVAL (offset));
14631   if (off_val >= 0)
14632     tmp -= (off_val & 0xfffff);
14633   else
14634     tmp += ((- off_val) & 0xfffff);
14635
14636   *pri = tmp;
14637   return;
14638 }
14639
14640 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14641    Adjust priority of sha1h instructions so they are scheduled before
14642    other SHA1 instructions.  */
14643
14644 static int
14645 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14646 {
14647   rtx x = PATTERN (insn);
14648
14649   if (GET_CODE (x) == SET)
14650     {
14651       x = SET_SRC (x);
14652
14653       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14654         return priority + 10;
14655     }
14656
14657   return priority;
14658 }
14659
14660 /* Given OPERANDS of consecutive load/store, check if we can merge
14661    them into ldp/stp.  LOAD is true if they are load instructions.
14662    MODE is the mode of memory operands.  */
14663
14664 bool
14665 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14666                                 machine_mode mode)
14667 {
14668   HOST_WIDE_INT offval_1, offval_2, msize;
14669   enum reg_class rclass_1, rclass_2;
14670   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14671
14672   if (load)
14673     {
14674       mem_1 = operands[1];
14675       mem_2 = operands[3];
14676       reg_1 = operands[0];
14677       reg_2 = operands[2];
14678       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14679       if (REGNO (reg_1) == REGNO (reg_2))
14680         return false;
14681     }
14682   else
14683     {
14684       mem_1 = operands[0];
14685       mem_2 = operands[2];
14686       reg_1 = operands[1];
14687       reg_2 = operands[3];
14688     }
14689
14690   /* The mems cannot be volatile.  */
14691   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14692     return false;
14693
14694   /* If we have SImode and slow unaligned ldp,
14695      check the alignment to be at least 8 byte. */
14696   if (mode == SImode
14697       && (aarch64_tune_params.extra_tuning_flags
14698           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14699       && !optimize_size
14700       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14701     return false;
14702
14703   /* Check if the addresses are in the form of [base+offset].  */
14704   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14705   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14706     return false;
14707   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14708   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14709     return false;
14710
14711   /* Check if the bases are same.  */
14712   if (!rtx_equal_p (base_1, base_2))
14713     return false;
14714
14715   offval_1 = INTVAL (offset_1);
14716   offval_2 = INTVAL (offset_2);
14717   msize = GET_MODE_SIZE (mode);
14718   /* Check if the offsets are consecutive.  */
14719   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14720     return false;
14721
14722   /* Check if the addresses are clobbered by load.  */
14723   if (load)
14724     {
14725       if (reg_mentioned_p (reg_1, mem_1))
14726         return false;
14727
14728       /* In increasing order, the last load can clobber the address.  */
14729       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14730       return false;
14731     }
14732
14733   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14734     rclass_1 = FP_REGS;
14735   else
14736     rclass_1 = GENERAL_REGS;
14737
14738   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14739     rclass_2 = FP_REGS;
14740   else
14741     rclass_2 = GENERAL_REGS;
14742
14743   /* Check if the registers are of same class.  */
14744   if (rclass_1 != rclass_2)
14745     return false;
14746
14747   return true;
14748 }
14749
14750 /* Given OPERANDS of consecutive load/store, check if we can merge
14751    them into ldp/stp by adjusting the offset.  LOAD is true if they
14752    are load instructions.  MODE is the mode of memory operands.
14753
14754    Given below consecutive stores:
14755
14756      str  w1, [xb, 0x100]
14757      str  w1, [xb, 0x104]
14758      str  w1, [xb, 0x108]
14759      str  w1, [xb, 0x10c]
14760
14761    Though the offsets are out of the range supported by stp, we can
14762    still pair them after adjusting the offset, like:
14763
14764      add  scratch, xb, 0x100
14765      stp  w1, w1, [scratch]
14766      stp  w1, w1, [scratch, 0x8]
14767
14768    The peephole patterns detecting this opportunity should guarantee
14769    the scratch register is avaliable.  */
14770
14771 bool
14772 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14773                                        machine_mode mode)
14774 {
14775   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14776   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14777   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14778   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14779
14780   if (load)
14781     {
14782       reg_1 = operands[0];
14783       mem_1 = operands[1];
14784       reg_2 = operands[2];
14785       mem_2 = operands[3];
14786       reg_3 = operands[4];
14787       mem_3 = operands[5];
14788       reg_4 = operands[6];
14789       mem_4 = operands[7];
14790       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14791                   && REG_P (reg_3) && REG_P (reg_4));
14792       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14793         return false;
14794     }
14795   else
14796     {
14797       mem_1 = operands[0];
14798       reg_1 = operands[1];
14799       mem_2 = operands[2];
14800       reg_2 = operands[3];
14801       mem_3 = operands[4];
14802       reg_3 = operands[5];
14803       mem_4 = operands[6];
14804       reg_4 = operands[7];
14805     }
14806   /* Skip if memory operand is by itslef valid for ldp/stp.  */
14807   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14808     return false;
14809
14810   /* The mems cannot be volatile.  */
14811   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14812       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
14813     return false;
14814
14815   /* Check if the addresses are in the form of [base+offset].  */
14816   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14817   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14818     return false;
14819   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14820   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14821     return false;
14822   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14823   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14824     return false;
14825   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14826   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14827     return false;
14828
14829   /* Check if the bases are same.  */
14830   if (!rtx_equal_p (base_1, base_2)
14831       || !rtx_equal_p (base_2, base_3)
14832       || !rtx_equal_p (base_3, base_4))
14833     return false;
14834
14835   offval_1 = INTVAL (offset_1);
14836   offval_2 = INTVAL (offset_2);
14837   offval_3 = INTVAL (offset_3);
14838   offval_4 = INTVAL (offset_4);
14839   msize = GET_MODE_SIZE (mode);
14840   /* Check if the offsets are consecutive.  */
14841   if ((offval_1 != (offval_2 + msize)
14842        || offval_1 != (offval_3 + msize * 2)
14843        || offval_1 != (offval_4 + msize * 3))
14844       && (offval_4 != (offval_3 + msize)
14845           || offval_4 != (offval_2 + msize * 2)
14846           || offval_4 != (offval_1 + msize * 3)))
14847     return false;
14848
14849   /* Check if the addresses are clobbered by load.  */
14850   if (load)
14851     {
14852       if (reg_mentioned_p (reg_1, mem_1)
14853           || reg_mentioned_p (reg_2, mem_2)
14854           || reg_mentioned_p (reg_3, mem_3))
14855         return false;
14856
14857       /* In increasing order, the last load can clobber the address.  */
14858       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14859         return false;
14860     }
14861
14862   /* If we have SImode and slow unaligned ldp,
14863      check the alignment to be at least 8 byte. */
14864   if (mode == SImode
14865       && (aarch64_tune_params.extra_tuning_flags
14866           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14867       && !optimize_size
14868       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14869     return false;
14870
14871   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14872     rclass_1 = FP_REGS;
14873   else
14874     rclass_1 = GENERAL_REGS;
14875
14876   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14877     rclass_2 = FP_REGS;
14878   else
14879     rclass_2 = GENERAL_REGS;
14880
14881   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14882     rclass_3 = FP_REGS;
14883   else
14884     rclass_3 = GENERAL_REGS;
14885
14886   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14887     rclass_4 = FP_REGS;
14888   else
14889     rclass_4 = GENERAL_REGS;
14890
14891   /* Check if the registers are of same class.  */
14892   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14893     return false;
14894
14895   return true;
14896 }
14897
14898 /* Given OPERANDS of consecutive load/store, this function pairs them
14899    into ldp/stp after adjusting the offset.  It depends on the fact
14900    that addresses of load/store instructions are in increasing order.
14901    MODE is the mode of memory operands.  CODE is the rtl operator
14902    which should be applied to all memory operands, it's SIGN_EXTEND,
14903    ZERO_EXTEND or UNKNOWN.  */
14904
14905 bool
14906 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14907                              machine_mode mode, RTX_CODE code)
14908 {
14909   rtx base, offset, t1, t2;
14910   rtx mem_1, mem_2, mem_3, mem_4;
14911   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14912
14913   if (load)
14914     {
14915       mem_1 = operands[1];
14916       mem_2 = operands[3];
14917       mem_3 = operands[5];
14918       mem_4 = operands[7];
14919     }
14920   else
14921     {
14922       mem_1 = operands[0];
14923       mem_2 = operands[2];
14924       mem_3 = operands[4];
14925       mem_4 = operands[6];
14926       gcc_assert (code == UNKNOWN);
14927     }
14928
14929   extract_base_offset_in_addr (mem_1, &base, &offset);
14930   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14931
14932   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14933   msize = GET_MODE_SIZE (mode);
14934   stp_off_limit = msize * 0x40;
14935   off_val = INTVAL (offset);
14936   abs_off = (off_val < 0) ? -off_val : off_val;
14937   new_off = abs_off % stp_off_limit;
14938   adj_off = abs_off - new_off;
14939
14940   /* Further adjust to make sure all offsets are OK.  */
14941   if ((new_off + msize * 2) >= stp_off_limit)
14942     {
14943       adj_off += stp_off_limit;
14944       new_off -= stp_off_limit;
14945     }
14946
14947   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14948   if (adj_off >= 0x1000)
14949     return false;
14950
14951   if (off_val < 0)
14952     {
14953       adj_off = -adj_off;
14954       new_off = -new_off;
14955     }
14956
14957   /* Create new memory references.  */
14958   mem_1 = change_address (mem_1, VOIDmode,
14959                           plus_constant (DImode, operands[8], new_off));
14960
14961   /* Check if the adjusted address is OK for ldp/stp.  */
14962   if (!aarch64_mem_pair_operand (mem_1, mode))
14963     return false;
14964
14965   msize = GET_MODE_SIZE (mode);
14966   mem_2 = change_address (mem_2, VOIDmode,
14967                           plus_constant (DImode,
14968                                          operands[8],
14969                                          new_off + msize));
14970   mem_3 = change_address (mem_3, VOIDmode,
14971                           plus_constant (DImode,
14972                                          operands[8],
14973                                          new_off + msize * 2));
14974   mem_4 = change_address (mem_4, VOIDmode,
14975                           plus_constant (DImode,
14976                                          operands[8],
14977                                          new_off + msize * 3));
14978
14979   if (code == ZERO_EXTEND)
14980     {
14981       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14982       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14983       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14984       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14985     }
14986   else if (code == SIGN_EXTEND)
14987     {
14988       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14989       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14990       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14991       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14992     }
14993
14994   if (load)
14995     {
14996       operands[1] = mem_1;
14997       operands[3] = mem_2;
14998       operands[5] = mem_3;
14999       operands[7] = mem_4;
15000     }
15001   else
15002     {
15003       operands[0] = mem_1;
15004       operands[2] = mem_2;
15005       operands[4] = mem_3;
15006       operands[6] = mem_4;
15007     }
15008
15009   /* Emit adjusting instruction.  */
15010   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
15011   /* Emit ldp/stp instructions.  */
15012   t1 = gen_rtx_SET (operands[0], operands[1]);
15013   t2 = gen_rtx_SET (operands[2], operands[3]);
15014   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15015   t1 = gen_rtx_SET (operands[4], operands[5]);
15016   t2 = gen_rtx_SET (operands[6], operands[7]);
15017   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
15018   return true;
15019 }
15020
15021 /* Return 1 if pseudo register should be created and used to hold
15022    GOT address for PIC code.  */
15023
15024 bool
15025 aarch64_use_pseudo_pic_reg (void)
15026 {
15027   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
15028 }
15029
15030 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
15031
15032 static int
15033 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
15034 {
15035   switch (XINT (x, 1))
15036     {
15037     case UNSPEC_GOTSMALLPIC:
15038     case UNSPEC_GOTSMALLPIC28K:
15039     case UNSPEC_GOTTINYPIC:
15040       return 0;
15041     default:
15042       break;
15043     }
15044
15045   return default_unspec_may_trap_p (x, flags);
15046 }
15047
15048
15049 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15050    return the log2 of that value.  Otherwise return -1.  */
15051
15052 int
15053 aarch64_fpconst_pow_of_2 (rtx x)
15054 {
15055   const REAL_VALUE_TYPE *r;
15056
15057   if (!CONST_DOUBLE_P (x))
15058     return -1;
15059
15060   r = CONST_DOUBLE_REAL_VALUE (x);
15061
15062   if (REAL_VALUE_NEGATIVE (*r)
15063       || REAL_VALUE_ISNAN (*r)
15064       || REAL_VALUE_ISINF (*r)
15065       || !real_isinteger (r, DFmode))
15066     return -1;
15067
15068   return exact_log2 (real_to_integer (r));
15069 }
15070
15071 /* If X is a vector of equal CONST_DOUBLE values and that value is
15072    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
15073
15074 int
15075 aarch64_vec_fpconst_pow_of_2 (rtx x)
15076 {
15077   if (GET_CODE (x) != CONST_VECTOR)
15078     return -1;
15079
15080   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
15081     return -1;
15082
15083   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
15084   if (firstval <= 0)
15085     return -1;
15086
15087   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
15088     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
15089       return -1;
15090
15091   return firstval;
15092 }
15093
15094 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15095    to float.
15096
15097    __fp16 always promotes through this hook.
15098    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15099    through the generic excess precision logic rather than here.  */
15100
15101 static tree
15102 aarch64_promoted_type (const_tree t)
15103 {
15104   if (SCALAR_FLOAT_TYPE_P (t)
15105       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
15106     return float_type_node;
15107
15108   return NULL_TREE;
15109 }
15110
15111 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
15112
15113 static bool
15114 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
15115                            optimization_type opt_type)
15116 {
15117   switch (op)
15118     {
15119     case rsqrt_optab:
15120       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
15121
15122     default:
15123       return true;
15124     }
15125 }
15126
15127 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15128    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15129
15130 static bool
15131 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
15132 {
15133   return (mode == HFmode
15134           ? true
15135           : default_libgcc_floating_mode_supported_p (mode));
15136 }
15137
15138 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15139    if MODE is HFmode, and punt to the generic implementation otherwise.  */
15140
15141 static bool
15142 aarch64_scalar_mode_supported_p (machine_mode mode)
15143 {
15144   return (mode == HFmode
15145           ? true
15146           : default_scalar_mode_supported_p (mode));
15147 }
15148
15149 /* Set the value of FLT_EVAL_METHOD.
15150    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15151
15152     0: evaluate all operations and constants, whose semantic type has at
15153        most the range and precision of type float, to the range and
15154        precision of float; evaluate all other operations and constants to
15155        the range and precision of the semantic type;
15156
15157     N, where _FloatN is a supported interchange floating type
15158        evaluate all operations and constants, whose semantic type has at
15159        most the range and precision of _FloatN type, to the range and
15160        precision of the _FloatN type; evaluate all other operations and
15161        constants to the range and precision of the semantic type;
15162
15163    If we have the ARMv8.2-A extensions then we support _Float16 in native
15164    precision, so we should set this to 16.  Otherwise, we support the type,
15165    but want to evaluate expressions in float precision, so set this to
15166    0.  */
15167
15168 static enum flt_eval_method
15169 aarch64_excess_precision (enum excess_precision_type type)
15170 {
15171   switch (type)
15172     {
15173       case EXCESS_PRECISION_TYPE_FAST:
15174       case EXCESS_PRECISION_TYPE_STANDARD:
15175         /* We can calculate either in 16-bit range and precision or
15176            32-bit range and precision.  Make that decision based on whether
15177            we have native support for the ARMv8.2-A 16-bit floating-point
15178            instructions or not.  */
15179         return (TARGET_FP_F16INST
15180                 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15181                 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
15182       case EXCESS_PRECISION_TYPE_IMPLICIT:
15183         return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
15184       default:
15185         gcc_unreachable ();
15186     }
15187   return FLT_EVAL_METHOD_UNPREDICTABLE;
15188 }
15189
15190 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
15191    scheduled for speculative execution.  Reject the long-running division
15192    and square-root instructions.  */
15193
15194 static bool
15195 aarch64_sched_can_speculate_insn (rtx_insn *insn)
15196 {
15197   switch (get_attr_type (insn))
15198     {
15199       case TYPE_SDIV:
15200       case TYPE_UDIV:
15201       case TYPE_FDIVS:
15202       case TYPE_FDIVD:
15203       case TYPE_FSQRTS:
15204       case TYPE_FSQRTD:
15205       case TYPE_NEON_FP_SQRT_S:
15206       case TYPE_NEON_FP_SQRT_D:
15207       case TYPE_NEON_FP_SQRT_S_Q:
15208       case TYPE_NEON_FP_SQRT_D_Q:
15209       case TYPE_NEON_FP_DIV_S:
15210       case TYPE_NEON_FP_DIV_D:
15211       case TYPE_NEON_FP_DIV_S_Q:
15212       case TYPE_NEON_FP_DIV_D_Q:
15213         return false;
15214       default:
15215         return true;
15216     }
15217 }
15218
15219 /* Target-specific selftests.  */
15220
15221 #if CHECKING_P
15222
15223 namespace selftest {
15224
15225 /* Selftest for the RTL loader.
15226    Verify that the RTL loader copes with a dump from
15227    print_rtx_function.  This is essentially just a test that class
15228    function_reader can handle a real dump, but it also verifies
15229    that lookup_reg_by_dump_name correctly handles hard regs.
15230    The presence of hard reg names in the dump means that the test is
15231    target-specific, hence it is in this file.  */
15232
15233 static void
15234 aarch64_test_loading_full_dump ()
15235 {
15236   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
15237
15238   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
15239
15240   rtx_insn *insn_1 = get_insn_by_uid (1);
15241   ASSERT_EQ (NOTE, GET_CODE (insn_1));
15242
15243   rtx_insn *insn_15 = get_insn_by_uid (15);
15244   ASSERT_EQ (INSN, GET_CODE (insn_15));
15245   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
15246
15247   /* Verify crtl->return_rtx.  */
15248   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
15249   ASSERT_EQ (0, REGNO (crtl->return_rtx));
15250   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
15251 }
15252
15253 /* Run all target-specific selftests.  */
15254
15255 static void
15256 aarch64_run_selftests (void)
15257 {
15258   aarch64_test_loading_full_dump ();
15259 }
15260
15261 } // namespace selftest
15262
15263 #endif /* #if CHECKING_P */
15264
15265 #undef TARGET_ADDRESS_COST
15266 #define TARGET_ADDRESS_COST aarch64_address_cost
15267
15268 /* This hook will determines whether unnamed bitfields affect the alignment
15269    of the containing structure.  The hook returns true if the structure
15270    should inherit the alignment requirements of an unnamed bitfield's
15271    type.  */
15272 #undef TARGET_ALIGN_ANON_BITFIELD
15273 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15274
15275 #undef TARGET_ASM_ALIGNED_DI_OP
15276 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15277
15278 #undef TARGET_ASM_ALIGNED_HI_OP
15279 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15280
15281 #undef TARGET_ASM_ALIGNED_SI_OP
15282 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15283
15284 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15285 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15286   hook_bool_const_tree_hwi_hwi_const_tree_true
15287
15288 #undef TARGET_ASM_FILE_START
15289 #define TARGET_ASM_FILE_START aarch64_start_file
15290
15291 #undef TARGET_ASM_OUTPUT_MI_THUNK
15292 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15293
15294 #undef TARGET_ASM_SELECT_RTX_SECTION
15295 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15296
15297 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15298 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15299
15300 #undef TARGET_BUILD_BUILTIN_VA_LIST
15301 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15302
15303 #undef TARGET_CALLEE_COPIES
15304 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15305
15306 #undef TARGET_CAN_ELIMINATE
15307 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15308
15309 #undef TARGET_CAN_INLINE_P
15310 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15311
15312 #undef TARGET_CANNOT_FORCE_CONST_MEM
15313 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15314
15315 #undef TARGET_CASE_VALUES_THRESHOLD
15316 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15317
15318 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15319 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15320
15321 /* Only the least significant bit is used for initialization guard
15322    variables.  */
15323 #undef TARGET_CXX_GUARD_MASK_BIT
15324 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15325
15326 #undef TARGET_C_MODE_FOR_SUFFIX
15327 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15328
15329 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15330 #undef  TARGET_DEFAULT_TARGET_FLAGS
15331 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15332 #endif
15333
15334 #undef TARGET_CLASS_MAX_NREGS
15335 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15336
15337 #undef TARGET_BUILTIN_DECL
15338 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15339
15340 #undef TARGET_BUILTIN_RECIPROCAL
15341 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15342
15343 #undef TARGET_C_EXCESS_PRECISION
15344 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15345
15346 #undef  TARGET_EXPAND_BUILTIN
15347 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15348
15349 #undef TARGET_EXPAND_BUILTIN_VA_START
15350 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15351
15352 #undef TARGET_FOLD_BUILTIN
15353 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15354
15355 #undef TARGET_FUNCTION_ARG
15356 #define TARGET_FUNCTION_ARG aarch64_function_arg
15357
15358 #undef TARGET_FUNCTION_ARG_ADVANCE
15359 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15360
15361 #undef TARGET_FUNCTION_ARG_BOUNDARY
15362 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15363
15364 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15365 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15366
15367 #undef TARGET_FUNCTION_VALUE
15368 #define TARGET_FUNCTION_VALUE aarch64_function_value
15369
15370 #undef TARGET_FUNCTION_VALUE_REGNO_P
15371 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15372
15373 #undef TARGET_FRAME_POINTER_REQUIRED
15374 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15375
15376 #undef TARGET_GIMPLE_FOLD_BUILTIN
15377 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15378
15379 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15380 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15381
15382 #undef  TARGET_INIT_BUILTINS
15383 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
15384
15385 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15386 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15387   aarch64_ira_change_pseudo_allocno_class
15388
15389 #undef TARGET_LEGITIMATE_ADDRESS_P
15390 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15391
15392 #undef TARGET_LEGITIMATE_CONSTANT_P
15393 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15394
15395 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15396 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15397   aarch64_legitimize_address_displacement
15398
15399 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15400 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15401
15402 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15403 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15404 aarch64_libgcc_floating_mode_supported_p
15405
15406 #undef TARGET_MANGLE_TYPE
15407 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15408
15409 #undef TARGET_MEMORY_MOVE_COST
15410 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15411
15412 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15413 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15414
15415 #undef TARGET_MUST_PASS_IN_STACK
15416 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15417
15418 /* This target hook should return true if accesses to volatile bitfields
15419    should use the narrowest mode possible.  It should return false if these
15420    accesses should use the bitfield container type.  */
15421 #undef TARGET_NARROW_VOLATILE_BITFIELD
15422 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15423
15424 #undef  TARGET_OPTION_OVERRIDE
15425 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15426
15427 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15428 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15429   aarch64_override_options_after_change
15430
15431 #undef TARGET_OPTION_SAVE
15432 #define TARGET_OPTION_SAVE aarch64_option_save
15433
15434 #undef TARGET_OPTION_RESTORE
15435 #define TARGET_OPTION_RESTORE aarch64_option_restore
15436
15437 #undef TARGET_OPTION_PRINT
15438 #define TARGET_OPTION_PRINT aarch64_option_print
15439
15440 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15441 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15442
15443 #undef TARGET_SET_CURRENT_FUNCTION
15444 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15445
15446 #undef TARGET_PASS_BY_REFERENCE
15447 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15448
15449 #undef TARGET_PREFERRED_RELOAD_CLASS
15450 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15451
15452 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15453 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15454
15455 #undef TARGET_PROMOTED_TYPE
15456 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15457
15458 #undef TARGET_SECONDARY_RELOAD
15459 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15460
15461 #undef TARGET_SHIFT_TRUNCATION_MASK
15462 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15463
15464 #undef TARGET_SETUP_INCOMING_VARARGS
15465 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15466
15467 #undef TARGET_STRUCT_VALUE_RTX
15468 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
15469
15470 #undef TARGET_REGISTER_MOVE_COST
15471 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15472
15473 #undef TARGET_RETURN_IN_MEMORY
15474 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15475
15476 #undef TARGET_RETURN_IN_MSB
15477 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15478
15479 #undef TARGET_RTX_COSTS
15480 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15481
15482 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15483 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15484
15485 #undef TARGET_SCHED_ISSUE_RATE
15486 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15487
15488 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15489 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15490   aarch64_sched_first_cycle_multipass_dfa_lookahead
15491
15492 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15493 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15494   aarch64_first_cycle_multipass_dfa_lookahead_guard
15495
15496 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15497 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15498   aarch64_get_separate_components
15499
15500 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15501 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15502   aarch64_components_for_bb
15503
15504 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15505 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15506   aarch64_disqualify_components
15507
15508 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15509 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15510   aarch64_emit_prologue_components
15511
15512 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15513 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15514   aarch64_emit_epilogue_components
15515
15516 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15517 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15518   aarch64_set_handled_components
15519
15520 #undef TARGET_TRAMPOLINE_INIT
15521 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15522
15523 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15524 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15525
15526 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15527 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15528
15529 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15530 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15531   aarch64_builtin_support_vector_misalignment
15532
15533 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15534 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15535
15536 #undef TARGET_VECTORIZE_ADD_STMT_COST
15537 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15538
15539 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15540 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15541   aarch64_builtin_vectorization_cost
15542
15543 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15544 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15545
15546 #undef TARGET_VECTORIZE_BUILTINS
15547 #define TARGET_VECTORIZE_BUILTINS
15548
15549 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15550 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15551   aarch64_builtin_vectorized_function
15552
15553 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15554 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15555   aarch64_autovectorize_vector_sizes
15556
15557 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15558 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15559   aarch64_atomic_assign_expand_fenv
15560
15561 /* Section anchor support.  */
15562
15563 #undef TARGET_MIN_ANCHOR_OFFSET
15564 #define TARGET_MIN_ANCHOR_OFFSET -256
15565
15566 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15567    byte offset; we can do much more for larger data types, but have no way
15568    to determine the size of the access.  We assume accesses are aligned.  */
15569 #undef TARGET_MAX_ANCHOR_OFFSET
15570 #define TARGET_MAX_ANCHOR_OFFSET 4095
15571
15572 #undef TARGET_VECTOR_ALIGNMENT
15573 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15574
15575 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15576 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15577   aarch64_simd_vector_alignment_reachable
15578
15579 /* vec_perm support.  */
15580
15581 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15582 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15583   aarch64_vectorize_vec_perm_const_ok
15584
15585 #undef TARGET_INIT_LIBFUNCS
15586 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15587
15588 #undef TARGET_FIXED_CONDITION_CODE_REGS
15589 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15590
15591 #undef TARGET_FLAGS_REGNUM
15592 #define TARGET_FLAGS_REGNUM CC_REGNUM
15593
15594 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15595 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15596
15597 #undef TARGET_ASAN_SHADOW_OFFSET
15598 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15599
15600 #undef TARGET_LEGITIMIZE_ADDRESS
15601 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15602
15603 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15604 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15605   aarch64_use_by_pieces_infrastructure_p
15606
15607 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15608 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15609
15610 #undef TARGET_CAN_USE_DOLOOP_P
15611 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15612
15613 #undef TARGET_SCHED_ADJUST_PRIORITY
15614 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15615
15616 #undef TARGET_SCHED_MACRO_FUSION_P
15617 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15618
15619 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15620 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15621
15622 #undef TARGET_SCHED_FUSION_PRIORITY
15623 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15624
15625 #undef TARGET_UNSPEC_MAY_TRAP_P
15626 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15627
15628 #undef TARGET_USE_PSEUDO_PIC_REG
15629 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15630
15631 #undef TARGET_PRINT_OPERAND
15632 #define TARGET_PRINT_OPERAND aarch64_print_operand
15633
15634 #undef TARGET_PRINT_OPERAND_ADDRESS
15635 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15636
15637 #undef TARGET_OPTAB_SUPPORTED_P
15638 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15639
15640 #undef TARGET_OMIT_STRUCT_RETURN_REG
15641 #define TARGET_OMIT_STRUCT_RETURN_REG true
15642
15643 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
15644 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15645 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15646
15647 #if CHECKING_P
15648 #undef TARGET_RUN_TARGET_SELFTESTS
15649 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15650 #endif /* #if CHECKING_P */
15651
15652 struct gcc_target targetm = TARGET_INITIALIZER;
15653
15654 #include "gt-aarch64.h"